Skip to content

Commit 35ff5cb

Browse files
committed
Use ActualText when getting the text for the text layer
1 parent 250cc7d commit 35ff5cb

File tree

4 files changed

+40
-0
lines changed

4 files changed

+40
-0
lines changed

src/core/evaluator.js

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2406,6 +2406,7 @@ class PartialEvaluator {
24062406
transform: null,
24072407
fontName: null,
24082408
hasEOL: false,
2409+
span: "",
24092410
};
24102411

24112412
// Use a circular buffer (length === 2) to save the last chars in the
@@ -3070,6 +3071,19 @@ class PartialEvaluator {
30703071
textContentItem.str.length = 0;
30713072
}
30723073

3074+
/**
 * Replace the buffered string chunks of the current `textContentItem` with
 * its pending `span` replacement text, if there is one.
 *
 * `textContentItem.span` is assigned from the /ActualText entry when a
 * marked-content sequence tagged "Span" begins, and this helper is invoked
 * on `OPS.endMarkedContent` right before `flushTextContentItem()`.
 *
 * No-op when `span` is empty. A whitespace-only `span` is consumed (reset)
 * but leaves the buffered chunks in `str` untouched.
 */
function replaceTextContentBySpan() {
3075+
const { span, str } = textContentItem;
3076+
if (!span) {
3077+
return;
3078+
}
3079+
// Consume the replacement text so that it is applied at most once.
textContentItem.span = "";
3080+
// Whitespace-only replacement text: keep the chunks already collected.
if (/^\s+$/.test(span)) {
3081+
return;
3082+
}
3083+
// Drop the collected chunks in place and substitute the replacement text.
str.length = 0;
3084+
str.push(span);
3085+
}
3086+
30733087
function enqueueChunk(batch = false) {
30743088
const length = textContent.items.length;
30753089
if (length === 0) {
@@ -3446,6 +3460,11 @@ class PartialEvaluator {
34463460
return;
34473461
case OPS.beginMarkedContent:
34483462
flushTextContentItem();
3463+
if (args[0]?.name === "Span") {
3464+
textContentItem.span = stringToPDFString(
3465+
args[1]?.get("ActualText") || ""
3466+
);
3467+
}
34493468
if (includeMarkedContent) {
34503469
markedContentData.level++;
34513470

@@ -3457,6 +3476,11 @@ class PartialEvaluator {
34573476
break;
34583477
case OPS.beginMarkedContentProps:
34593478
flushTextContentItem();
3479+
if (args[0]?.name === "Span") {
3480+
textContentItem.span = stringToPDFString(
3481+
args[1]?.get("ActualText") || ""
3482+
);
3483+
}
34603484
if (includeMarkedContent) {
34613485
markedContentData.level++;
34623486

@@ -3474,6 +3498,7 @@ class PartialEvaluator {
34743498
}
34753499
break;
34763500
case OPS.endMarkedContent:
3501+
replaceTextContentBySpan();
34773502
flushTextContentItem();
34783503
if (includeMarkedContent) {
34793504
if (markedContentData.level === 0) {

test/pdfs/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -726,3 +726,4 @@
726726
!chrome-text-selection-markedContent.pdf
727727
!bug1963407.pdf
728728
!issue19517.pdf
729+
!issue20007.pdf

test/pdfs/issue20007.pdf

11.3 KB
Binary file not shown.

test/unit/api_spec.js

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3923,6 +3923,20 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
39233923
expect(items[1].fontName).not.toEqual(items[0].fontName);
39243924
});
39253925

3926+
// Loads issue20007.pdf, extracts the text content of page 1 and checks that
// the merged item text equals the expected sentence. Per the test title, that
// text is expected to come from /ActualText.
it("gets text content from /ActualText", async function () {
3927+
const loadingTask = getDocument(buildGetDocumentParams("issue20007.pdf"));
3928+
const pdfDoc = await loadingTask.promise;
3929+
const pdfPage = await pdfDoc.getPage(1);
3930+
3931+
const { items } = await pdfPage.getTextContent({
3932+
disableNormalization: true,
3933+
});
3934+
const text = mergeText(items);
3935+
expect(text).toEqual("The quick brown fox jumps over the lazy dog");
3936+
3937+
// Tear down the loading task created by this test.
await loadingTask.destroy();
3938+
});
3939+
39263940
it("gets empty structure tree", async function () {
39273941
const tree = await page.getStructTree();
39283942

0 commit comments

Comments
 (0)