From dc35d21662a673a1b2d42c12425231c8efefb3bd Mon Sep 17 00:00:00 2001 From: Lukasz Jagiello Date: Wed, 11 Feb 2026 13:31:46 -0800 Subject: [PATCH] fix(text): preserve stream order for RTL-placed design-tool PDFs Design tools like Figma and Canva emit LTR text with right-to-left TJ placement. Sorting by x-position reverses the text. Detect this pattern via sequenceIndex and preserve content stream order instead. - Add RTL_PLACED_THRESHOLD constant (0.8) with documentation - Return OrderedLine { chars, rtlPlaced } from orderLineChars - Fix gap calculation in groupIntoSpans for RTL-placed lines - Fix createSpaceChar bbox positioning for RTL-placed lines - Use fractional sequenceIndex (n + 0.5) for synthetic spaces - Make sequenceIndex optional on ExtractedChar - Guard against missing sequenceIndex (fall back to x-sort) - Document that heuristic correctly handles genuine RTL text - Document mixed bidi limitation (needs full bidi algorithm) - Add 12 unit tests for RTL-placed detection edge cases --- fixtures/text/rtl-placed-ltr-text.pdf | Bin 0 -> 5879 bytes src/integration/text/rtl-placed-text.test.ts | 33 +++ src/text/line-grouper.test.ts | 244 +++++++++++++++++++ src/text/line-grouper.ts | 143 ++++++++++- src/text/text-extractor.ts | 1 + src/text/text-search.test.ts | 1 + src/text/types.ts | 2 + 7 files changed, 414 insertions(+), 10 deletions(-) create mode 100644 fixtures/text/rtl-placed-ltr-text.pdf create mode 100644 src/integration/text/rtl-placed-text.test.ts diff --git a/fixtures/text/rtl-placed-ltr-text.pdf b/fixtures/text/rtl-placed-ltr-text.pdf new file mode 100644 index 0000000000000000000000000000000000000000..ca421574c6c9286b0b9a5d515ae3ac5b714080f4 GIT binary patch literal 5879 zcmeHL&5qkP5WedvcnQ!Rme!CWB>{^CvPl*#nl?dVBn5&Tw6@l(z_u2aT%>)89{L9D zqwI_Iqb!*;Ck}e(F##5lH5_v0n~(FOC$FF1T#(F?Cy#$W{`2_f<6jAhRJ6CBlFLgm zoB#e$i`lcPuh#8K%wAWkx)TZu-iWKKq~0#wLihVr;jnJ@Q?u;E2kn-B#4hcA+Y1U5 zorEj8V|F!`9?qHlTrZpIy8SFZq*!lBiM&)F6CvD|+a4=Bk-0O5S8wXB-R~AK^YYTY 
zz^WUp!Y@3Kux_C<;UN0;)$ONxagfgI&;7fm+gEF0f>l>nfyM5e$>NvwcGcetZSu^m z==xnm~X^owzv}WPbYGc2F{Lh zdl$)7X=P!K@?6F8qLgWVlv@+a^IT@++R2(zA-0q9qVVg zEV840IkR6|NySlanf+QxRUYNsew9h%@wZugK3SQ2`#Jukr48_l!&2i z9_1{4ROV9o_%ZvbOhb7d<3~jv?r-DxP@zxsWB(}!KRv$u{G6l>@S_MnGNmj7|Ks?O zQIb4=89ym2qzdq1_GOvWU&BY|q1-ZjR4I-3kHv=+$s+Ktj{T!7lym&kT&4j(V*XN^ zp2%bSaJ^lw8?S#JyI&Mi``f|LD>9jRaG7>)r46u+ZOfHRz3Yscsw`v%+teX-**NqH z)U+^elE=0yP}6`KlRT!T0`+oOKgxOPaOMa5GksfzauyopQd6KGL$5&Hui9rQw9m%W zP??q`;GO)-BkHIOp$BzGIlEeFOkI2rEvAUK4XU>Ts@@D zv-S~%a*l6`(gFTV_Qm#7X@=A~$zy&{iL2-N!SErlI}PH;^uZ68I;Q>O(n;XAOC9I@ zBa71D@nM&XELtY7&+sLK(%|u9rHeA~H;~8tBW+~r{b!|(6iP$jKl6`DnFance8@=S z?dSL+zSjHC_(6zo;q7Pml&HL4F@MZWBl_%ikGbhw@W$LUX|uKm+;N!?`OCFcEH(;d z&%*af&hD-|K9I+;Q8~B`Y$tu5d%*2LKgNE@J%OC*8i!5ig_8%pUR_f7io-B#e9YfC3N+aKI>x$*4&q(XhmavRzz zmpey&R(poF3EZ6IF+T`xvCB=9oZTI;)3u$WoYg);+njp)8TSeNaBbbF&+sE_hPG-t zKgMs!E#7{9_e?&tU1#^tfPWDG7=Ox8+kJd_+oKCvdVVnc3H-qHGWo~#QF~l&Jc}P{ zi_3jy{U6(^v;HCXgy+X?B}$9-^kejJorky1#+BJM9>L{{FVC2JCfg5v&x^U6gva@B zoQd58ISq=3A$l4$XK(kn{V-PFw7bnPvtECBx(!F!FHf=KAUhjJQH6WqCsB-^!_=9% zv2z+^mdST7+g-g8%|o}}h-JHOccN>0QElp8EZS{{ak%dHyYJmF3eVUsF7INOYS}zA z-J;p9M7@T7S1+-fsGEJaX_um}HxKRkO&oyOq}eW-WwUhO1@^sISGO=tz>dMJT5PJ- zwwerK5W7+S6yjmf_xL*8NzAURu0BY9sMo*MeY2u#}Y9(rWc92q10JnDY)JQXMmA1Hr8 z0v5AlhC;qySF28Fb;w3I;!q_n2y^>|QbyqWK`Kg32P1yOC)#H5bh}#DE@M6I7VZ;j zjwx^Ej&eUPP#=cfo>dP&)XnOCl)pu?ekag#`cJx_clBM8iXy=o<3C}IL1DcU$9GhU zK_<5T9j1o(9l_T4okj-|z0;*)x*3}Fc%8}cW#FjW@2X~9?}m@nZ<{}A0e|1Lt^3YA VMxpDgT|f9^;6d`_$%|Jv$-j!v@VWp1 literal 0 HcmV?d00001 diff --git a/src/integration/text/rtl-placed-text.test.ts b/src/integration/text/rtl-placed-text.test.ts new file mode 100644 index 0000000..62d138d --- /dev/null +++ b/src/integration/text/rtl-placed-text.test.ts @@ -0,0 +1,33 @@ +/** + * Regression test for reversed text extraction from design-tool PDFs. + * + * Some design tools (e.g. 
Figma, Canva) export PDFs where characters are + * placed RIGHT-TO-LEFT in user space via TJ positioning adjustments, even + * though the text is LTR (English). The font has near-zero glyph widths, + * and all positioning is done via positive TJ adjustments (which move the + * pen left). Characters appear in correct reading order in the content + * stream, but their x-positions decrease. + * + * The line grouper sorts characters by x-position (left to right), which + * reverses the correct reading order for these PDFs. + */ +import { PDF } from "#src/api/pdf"; +import { loadFixture } from "#src/test-utils"; +import { describe, expect, it } from "vitest"; + +describe("RTL-placed LTR text (design-tool PDFs)", () => { + it("extracts text in correct reading order, not reversed", async () => { + const bytes = await loadFixture("text", "rtl-placed-ltr-text.pdf"); + const pdf = await PDF.load(bytes); + const page = pdf.getPage(0); + + expect(page).not.toBeNull(); + + const pageText = page!.extractText(); + + // The fixture has lorem ipsum text placed right-to-left via TJ adjustments. + // Text should read correctly, not reversed. 
+ expect(pageText.text).toContain("Lorem ipsum dolor sit amet consectetur"); + expect(pageText.text).not.toContain("rutetcesnoc tema tis rolod muspi meroL"); + }); +}); diff --git a/src/text/line-grouper.test.ts b/src/text/line-grouper.test.ts index f1ee258..ba76456 100644 --- a/src/text/line-grouper.test.ts +++ b/src/text/line-grouper.test.ts @@ -19,6 +19,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 0, }, { char: "e", @@ -26,6 +27,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 1, }, { char: "l", @@ -33,6 +35,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 2, }, { char: "l", @@ -40,6 +43,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 3, }, { char: "o", @@ -47,6 +51,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 4, }, ]; @@ -66,6 +71,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 100, + sequenceIndex: 0, }, { char: "B", @@ -73,6 +79,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 100, + sequenceIndex: 1, }, // Line 2 at baseline 80 { @@ -81,6 +88,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 80, + sequenceIndex: 2, }, { char: "D", @@ -88,6 +96,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 80, + sequenceIndex: 3, }, ]; @@ -109,6 +118,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 0, }, { char: "i", @@ -116,6 +126,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 1, }, // Gap that should trigger space insertion { @@ -124,6 +135,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + 
sequenceIndex: 2, }, { char: "h", @@ -131,6 +143,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 3, }, { char: "e", @@ -138,6 +151,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 4, }, { char: "r", @@ -145,6 +159,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 5, }, { char: "e", @@ -152,6 +167,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 6, }, ]; @@ -169,6 +185,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 0, }, { char: "o", @@ -176,6 +193,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 1, }, { char: "r", @@ -183,6 +201,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 2, }, { char: "m", @@ -190,6 +209,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 3, }, { char: "a", @@ -197,6 +217,7 @@ describe("LineGrouper", () => { fontSize: 14, fontName: "Helvetica-Bold", baseline: 10, + sequenceIndex: 4, }, { char: "l", @@ -204,6 +225,7 @@ describe("LineGrouper", () => { fontSize: 14, fontName: "Helvetica-Bold", baseline: 10, + sequenceIndex: 5, }, ]; @@ -224,6 +246,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 0, }, { char: "B", @@ -231,6 +254,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10.5, + sequenceIndex: 1, }, { char: "C", @@ -238,6 +262,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 11, + sequenceIndex: 2, }, ]; @@ -255,6 +280,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 0, }, // Small gap - should NOT be a space with high 
threshold { @@ -263,6 +289,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: 1, }, ]; @@ -278,6 +305,219 @@ describe("LineGrouper", () => { }); }); + describe("RTL-placed text detection", () => { + /** Helper to build an ExtractedChar with sensible defaults. */ + function makeChar(char: string, x: number, sequenceIndex?: number, width = 8): ExtractedChar { + return { + char, + bbox: { x, y: 0, width, height: 12 }, + fontSize: 12, + fontName: "Helvetica", + baseline: 10, + sequenceIndex, + }; + } + + it("preserves stream order for 100% RTL-placed chars", () => { + // Chars placed right-to-left (x decreasing) but stream order is A, B, C, D. + // Adjacent chars touch (each char's x + width = previous char's x) so no spaces inserted. + const chars = [ + makeChar("A", 30, 0), + makeChar("B", 22, 1), + makeChar("C", 14, 2), + makeChar("D", 6, 3), + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("ABCD"); + }); + + it("detects RTL-placed at exactly 80% threshold", () => { + // 6 chars → 5 pairs. 4 decreasing = 80% → should be detected. + // Chars touch or overlap (width=8), so no gap is positive and no spaces are inserted. + const chars = [ + makeChar("A", 50, 0), + makeChar("B", 42, 1), // decreasing + makeChar("C", 34, 2), // decreasing + makeChar("D", 26, 3), // decreasing + makeChar("E", 28, 4), // increasing (forward jump) + makeChar("F", 20, 5), // decreasing + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("ABCDEF"); + }); + + it("falls back to x-sort below 80% threshold", () => { + // 6 chars → 5 pairs. 3 decreasing = 60% → NOT detected → x-sort. 
+ const chars = [ + makeChar("A", 50, 0), + makeChar("B", 42, 1), // decreasing + makeChar("C", 44, 2), // increasing + makeChar("D", 36, 3), // decreasing + makeChar("E", 38, 4), // increasing + makeChar("F", 30, 5), // decreasing + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + // x-sorted order: F(30), D(36), E(38), B(42), C(44), A(50) + expect(lines[0].text).toBe("FDEBCA"); + }); + + it("uses x-sort for normal LTR text", () => { + const chars = [ + makeChar("A", 0, 0), + makeChar("B", 10, 1), + makeChar("C", 20, 2), + makeChar("D", 30, 3), + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("ABCD"); + }); + + it("handles single character", () => { + const chars = [makeChar("X", 10, 0)]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("X"); + }); + + it("detects two chars with decreasing x as RTL-placed", () => { + // 2 chars → 1 pair, 1/1 = 100% decreasing + const chars = [makeChar("A", 20, 0), makeChar("B", 10, 1)]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("AB"); + }); + + it("preserves stream order for genuine RTL text with normal glyph widths", () => { + // Real RTL text (Arabic/Hebrew) has normal glyph widths and decreasing x. + // The heuristic correctly detects this and preserves stream order, which + // IS the correct reading order for RTL text. 
+ const chars = [ + makeChar("\u0628", 30, 0), // ba + makeChar("\u0627", 22, 1), // alef + makeChar("\u062F", 14, 2), // dal + makeChar("\u0631", 6, 3), // ra + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + // Stream order preserved: ba, alef, dal, ra (correct reading order) + expect(lines[0].text).toBe("\u0628\u0627\u062F\u0631"); + }); + + it("inserts space correctly in RTL-placed lines", () => { + // Two words placed right-to-left with a gap between them. + // Within-word: chars adjacent (prev.x - (char.x + char.width) ≈ 0). + // Between-word: gap = 42 - (24 + 8) = 10 > 3.6 threshold → space. + const chars = [ + makeChar("H", 50, 0), + makeChar("i", 42, 1), + makeChar("t", 24, 2), + makeChar("h", 16, 3), + makeChar("e", 8, 4), + makeChar("r", 0, 5), + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("Hi ther"); + }); + + it("inserts multiple spaces in RTL-placed lines with three words", () => { + // Three words "AB CD EF" placed right-to-left. + // Within-word gap = prev.x - (char.x + 8) = 0 → no space. + // Between-word gap = 8 > 3.6 → space. + const chars = [ + makeChar("A", 52, 0), + makeChar("B", 44, 1), // gap = 52 - 52 = 0 → no space + makeChar("C", 28, 2), // gap = 44 - 36 = 8 → space + makeChar("D", 20, 3), // gap = 28 - 28 = 0 → no space + makeChar("E", 4, 4), // gap = 20 - 12 = 8 → space + makeChar("F", -4, 5), // gap = 4 - 4 = 0 → no space + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("AB CD EF"); + }); + + it("handles overlapping RTL-placed characters without crashing", () => { + // Tightly kerned chars where bboxes overlap slightly. 
+ // gap = prevChar.x - (char.x + char.width) → negative → no space + const chars = [ + makeChar("A", 20, 0), + makeChar("B", 13, 1), // gap = 20 - 21 = -1 → no space (overlap) + makeChar("C", 6, 2), // gap = 13 - 14 = -1 → no space (overlap) + makeChar("D", -1, 3), // gap = 6 - 7 = -1 → no space (overlap) + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + expect(lines[0].text).toBe("ABCD"); + }); + + it("handles mixed RTL-placed and LTR lines on the same page", () => { + // Line 1 (baseline 100): RTL-placed text (decreasing x in stream order) + // Line 2 (baseline 80): normal LTR text (increasing x) + // Each line's RTL detection is independent. + const chars: ExtractedChar[] = [ + // RTL-placed line — adjacent chars (no spaces) + { ...makeChar("R", 24, 0), baseline: 100, bbox: { x: 24, y: 90, width: 8, height: 12 } }, + { ...makeChar("T", 16, 1), baseline: 100, bbox: { x: 16, y: 90, width: 8, height: 12 } }, + { ...makeChar("L", 8, 2), baseline: 100, bbox: { x: 8, y: 90, width: 8, height: 12 } }, + // Normal LTR line — adjacent chars (no spaces) + { ...makeChar("L", 0, 3), baseline: 80, bbox: { x: 0, y: 70, width: 8, height: 12 } }, + { ...makeChar("T", 8, 4), baseline: 80, bbox: { x: 8, y: 70, width: 8, height: 12 } }, + { ...makeChar("R", 16, 5), baseline: 80, bbox: { x: 16, y: 70, width: 8, height: 12 } }, + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(2); + // Line 1 (higher baseline): RTL-placed → stream order preserved + expect(lines[0].text).toBe("RTL"); + expect(lines[0].baseline).toBe(100); + // Line 2 (lower baseline): normal LTR → x-sort + expect(lines[1].text).toBe("LTR"); + expect(lines[1].baseline).toBe(80); + }); + + it("falls back to x-sort when sequenceIndex is missing", () => { + // Chars placed right-to-left but without sequenceIndex — should x-sort + const chars = [ + makeChar("A", 30, undefined), + makeChar("B", 20, undefined), + makeChar("C", 10, undefined), + 
makeChar("D", 0, undefined), + ]; + + const lines = groupCharsIntoLines(chars); + + expect(lines).toHaveLength(1); + // x-sort produces D(0), C(10), B(20), A(30) + expect(lines[0].text).toBe("DCBA"); + }); + }); + describe("getPlainText", () => { it("joins lines with newlines", () => { const chars: ExtractedChar[] = [ @@ -287,6 +527,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 100, + sequenceIndex: 0, }, { char: "1", @@ -294,6 +535,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 100, + sequenceIndex: 1, }, { char: "L", @@ -301,6 +543,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 80, + sequenceIndex: 2, }, { char: "2", @@ -308,6 +551,7 @@ describe("LineGrouper", () => { fontSize: 12, fontName: "Helvetica", baseline: 80, + sequenceIndex: 3, }, ]; diff --git a/src/text/line-grouper.ts b/src/text/line-grouper.ts index a4b2aca..a8c27e7 100644 --- a/src/text/line-grouper.ts +++ b/src/text/line-grouper.ts @@ -9,6 +9,25 @@ import type { ExtractedChar, TextLine, TextSpan } from "./types"; import { mergeBboxes } from "./types"; +/** + * Minimum fraction of consecutive char pairs with decreasing x-positions + * (in stream order) to classify a line as "RTL-placed". + * + * Figma/Canva exports produce ~100% decreasing pairs within words. + * 80% tolerates small forward jumps at word boundaries. + */ +const RTL_PLACED_THRESHOLD = 0.8; + +/** + * Result of ordering characters within a line. + */ +interface OrderedLine { + /** Characters in reading order */ + chars: ExtractedChar[]; + /** Whether the line was detected as RTL-placed (design-tool pattern) */ + rtlPlaced: boolean; +} + /** * Options for line grouping. 
*/ @@ -53,11 +72,15 @@ export function groupCharsIntoLines( const lines: TextLine[] = []; for (const group of lineGroups) { - // Sort characters left-to-right within the line - const sorted = [...group].sort((a, b) => a.bbox.x - b.bbox.x); + // Order characters within the line. + // Normally we sort left-to-right by x-position, but some design tools + // (Figma, Canva) place characters right-to-left via TJ adjustments while + // the text is actually LTR. In that case, content stream order is correct + // and position-based sorting would reverse the text. + const { chars: sorted, rtlPlaced } = orderLineChars(group); // Group into spans and detect spaces - const spans = groupIntoSpans(sorted, spaceThreshold); + const spans = groupIntoSpans(sorted, spaceThreshold, rtlPlaced); if (spans.length === 0) { continue; @@ -82,6 +105,93 @@ export function groupCharsIntoLines( return lines; } +/** + * Determine the correct character order for a line. + * + * Design tools like Figma and Canva export PDFs where LTR characters are placed + * right-to-left via TJ positioning adjustments (positive values move the pen left). + * The font has near-zero glyph widths, so all positioning comes from TJ. Characters + * appear in correct reading order in the content stream, but their x-positions + * decrease monotonically. + * + * When this pattern is detected, we preserve content stream order instead of sorting + * by x-position, which would reverse the text. + * + * **Limitation**: Detection requires `sequenceIndex` on every character. If any + * character in the group lacks a `sequenceIndex`, we fall back to x-position sorting + * because stream order cannot be reliably reconstructed. 
+ */ +function orderLineChars(group: ExtractedChar[]): OrderedLine { + if (group.length <= 1) { + return { chars: [...group], rtlPlaced: false }; + } + + // If any character lacks sequenceIndex, fall back to x-sort + const hasStreamOrder = group.every(c => c.sequenceIndex != null); + + if (!hasStreamOrder) { + return { + chars: [...group].sort((a, b) => a.bbox.x - b.bbox.x), + rtlPlaced: false, + }; + } + + // Sort by sequenceIndex to get content stream order. + // Safe to use `!` — hasStreamOrder guarantees every char has sequenceIndex. + const streamOrder = [...group].sort((a, b) => a.sequenceIndex! - b.sequenceIndex!); + + if (isRtlPlaced(streamOrder)) { + return { chars: streamOrder, rtlPlaced: true }; + } + + // Normal case: sort left-to-right by x-position + return { + chars: [...group].sort((a, b) => a.bbox.x - b.bbox.x), + rtlPlaced: false, + }; +} + +/** + * Detect whether characters are placed right-to-left in user space while + * content stream order represents the correct reading order. + * + * Returns true when x-positions in stream order are predominantly decreasing + * (≥ 80% of consecutive pairs). In that case, position-based sorting would + * reverse the reading order, so we preserve stream order instead. + * + * This covers two real-world scenarios: + * - **Design-tool PDFs** (Figma, Canva): LTR text placed right-to-left via + * TJ positioning adjustments. Stream order = correct reading order. + * - **Genuine RTL text** (Arabic, Hebrew): characters naturally placed + * right-to-left. PDF producers typically emit them in reading order, so + * stream order is again correct. + * + * In both cases, when x-positions decrease in stream order, preserving stream + * order produces the correct reading order. + * + * **Known limitation**: mixed bidi text (e.g., Arabic with embedded English) + * requires a full Unicode bidi algorithm, which is out of scope for this + * heuristic. 
For mixed lines, neither stream order nor x-sort is fully + * correct; a future bidi implementation should replace this heuristic. + */ +function isRtlPlaced(streamOrder: ExtractedChar[]): boolean { + if (streamOrder.length < 2) { + return false; + } + + // Count how many consecutive character pairs have decreasing x + let decreasingCount = 0; + for (let i = 1; i < streamOrder.length; i++) { + if (streamOrder[i].bbox.x < streamOrder[i - 1].bbox.x) { + decreasingCount++; + } + } + + const totalPairs = streamOrder.length - 1; + + return decreasingCount / totalPairs >= RTL_PLACED_THRESHOLD; +} + /** * Group characters by baseline Y coordinate. */ @@ -113,7 +223,11 @@ function groupByBaseline(chars: ExtractedChar[], tolerance: number): ExtractedCh /** * Group characters into spans based on font/size and detect spaces. */ -function groupIntoSpans(chars: ExtractedChar[], spaceThreshold: number): TextSpan[] { +function groupIntoSpans( + chars: ExtractedChar[], + spaceThreshold: number, + rtlPlaced: boolean, +): TextSpan[] { if (chars.length === 0) { return []; } @@ -131,8 +245,12 @@ function groupIntoSpans(chars: ExtractedChar[], spaceThreshold: number): TextSpa const fontChanged = char.fontName !== currentFontName || Math.abs(char.fontSize - currentFontSize) > 0.5; - // Check for space gap - const gap = char.bbox.x - (prevChar.bbox.x + prevChar.bbox.width); + // Check for space gap — in RTL-placed lines, the "next" character in + // reading order sits to the left of the previous one, so the gap is + // measured from the left edge of prevChar to the right edge of char. + const gap = rtlPlaced + ? 
prevChar.bbox.x - (char.bbox.x + char.bbox.width) + : char.bbox.x - (prevChar.bbox.x + prevChar.bbox.width); const avgFontSize = (prevChar.fontSize + char.fontSize) / 2; const needsSpace = gap > avgFontSize * spaceThreshold; @@ -147,7 +265,7 @@ function groupIntoSpans(chars: ExtractedChar[], spaceThreshold: number): TextSpa } else if (needsSpace) { // Add space to current span and continue // We insert a synthetic space character - currentSpan.push(createSpaceChar(prevChar, char)); + currentSpan.push(createSpaceChar(prevChar, char, rtlPlaced)); currentSpan.push(char); } else { currentSpan.push(char); @@ -184,9 +302,13 @@ function buildSpan(chars: ExtractedChar[]): TextSpan { /** * Create a synthetic space character between two characters. */ -function createSpaceChar(before: ExtractedChar, after: ExtractedChar): ExtractedChar { - const x = before.bbox.x + before.bbox.width; - const width = after.bbox.x - x; +function createSpaceChar( + before: ExtractedChar, + after: ExtractedChar, + rtlPlaced: boolean, +): ExtractedChar { + const x = rtlPlaced ? after.bbox.x + after.bbox.width : before.bbox.x + before.bbox.width; + const width = rtlPlaced ? before.bbox.x - x : after.bbox.x - x; return { char: " ", @@ -199,6 +321,7 @@ function createSpaceChar(before: ExtractedChar, after: ExtractedChar): Extracted fontSize: (before.fontSize + after.fontSize) / 2, fontName: before.fontName, baseline: (before.baseline + after.baseline) / 2, + sequenceIndex: before.sequenceIndex != null ? 
before.sequenceIndex + 0.5 : undefined, }; } diff --git a/src/text/text-extractor.ts b/src/text/text-extractor.ts index 1b134f9..4a9c41f 100644 --- a/src/text/text-extractor.ts +++ b/src/text/text-extractor.ts @@ -278,6 +278,7 @@ export class TextExtractor { fontSize: this.state.effectiveFontSize, fontName: font.baseFontName, baseline: bbox.baseline, + sequenceIndex: this.chars.length, }); // Advance text position diff --git a/src/text/text-search.test.ts b/src/text/text-search.test.ts index 9d4b5b6..122dfab 100644 --- a/src/text/text-search.test.ts +++ b/src/text/text-search.test.ts @@ -20,6 +20,7 @@ function createPageText(text: string, pageIndex = 0): PageText { fontSize: 12, fontName: "Helvetica", baseline: 10, + sequenceIndex: chars.length, }); x += 10; } diff --git a/src/text/types.ts b/src/text/types.ts index 2b7052d..a505f38 100644 --- a/src/text/types.ts +++ b/src/text/types.ts @@ -33,6 +33,8 @@ export interface ExtractedChar { fontName: string; /** Y coordinate of the text baseline */ baseline: number; + /** Index in the content stream extraction order (0-based) */ + sequenceIndex?: number; } /**