From 1b971c45d69517389e35f7329a978d48fe092dbf Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 7 Aug 2025 11:20:18 +0300 Subject: [PATCH 1/4] pom.xml: upgrade pdfbox and tika Use latest pdfbox 3.0.5 and tika 3.2.2. See: https://pdfbox.apache.org/3.0/migration.html See: https://dist.apache.org/repos/dist/release/tika/3.2.2/CHANGES-3.2.2.txt --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index e7f0aef230..369dc6acc3 100644 --- a/pom.xml +++ b/pom.xml @@ -38,12 +38,12 @@ 4.0.5 1.1.1 - 9.4.57.v20241219 - 2.24.3 - 2.0.34 + 9.4.58.v20250814 + 2.25.2 + 3.0.5 1.19.0 2.0.17 - 2.9.4 + 3.2.2 1.81 8.0.1 From 6aa3ddb1b616ab22a9ce096444c0b1311be2f6f0 Mon Sep 17 00:00:00 2001 From: Alan Orth Date: Thu, 7 Aug 2025 11:39:09 +0300 Subject: [PATCH 2/4] dspace-api: Update syntax for pdfbox 3.0.x Conflicts resolved with CitationDocumentServiceImpl to satisfy the older implementation. --- .../ImageMagickThumbnailFilter.java | 6 ++--- .../app/mediafilter/PDFBoxThumbnail.java | 4 +++- .../dspace/content/packager/PDFPackager.java | 23 +++++++++++-------- .../EpoImportMetadataSourceServiceImpl.java | 2 +- .../app/rest/BitstreamRestControllerIT.java | 6 +++-- 5 files changed, 25 insertions(+), 16 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/ImageMagickThumbnailFilter.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/ImageMagickThumbnailFilter.java index 408982d157..7543410a79 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/ImageMagickThumbnailFilter.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/ImageMagickThumbnailFilter.java @@ -14,7 +14,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; -import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.dspace.content.Bitstream; @@ -153,8 +153,8 @@ public 
File getImageFile(File f, boolean verbose) // the CropBox is missing or empty because pdfbox will set it to the // same size as the MediaBox if it doesn't exist. Also note that we // only need to check the first page, since that's what we use for - // generating the thumbnail (PDDocument uses a zero-based index). - PDPage pdfPage = PDDocument.load(f).getPage(0); + // generating the thumbnail (PDDocument.getPage uses a zero-based index). + PDPage pdfPage = Loader.loadPDF(f).getPage(0); PDRectangle pdfPageMediaBox = pdfPage.getMediaBox(); PDRectangle pdfPageCropBox = pdfPage.getCropBox(); diff --git a/dspace-api/src/main/java/org/dspace/app/mediafilter/PDFBoxThumbnail.java b/dspace-api/src/main/java/org/dspace/app/mediafilter/PDFBoxThumbnail.java index 3acb6900db..94c463b280 100644 --- a/dspace-api/src/main/java/org/dspace/app/mediafilter/PDFBoxThumbnail.java +++ b/dspace-api/src/main/java/org/dspace/app/mediafilter/PDFBoxThumbnail.java @@ -11,6 +11,8 @@ import java.io.InputStream; import org.apache.logging.log4j.Logger; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException; import org.apache.pdfbox.rendering.PDFRenderer; @@ -71,7 +73,7 @@ public InputStream getDestinationStream(Item currentItem, InputStream source, bo BufferedImage buf; // Render the page image. 
- try ( PDDocument doc = PDDocument.load(source); ) { + try ( PDDocument doc = Loader.loadPDF(new RandomAccessReadBuffer(source)); ) { PDFRenderer renderer = new PDFRenderer(doc); buf = renderer.renderImage(0); } catch (InvalidPasswordException ex) { diff --git a/dspace-api/src/main/java/org/dspace/content/packager/PDFPackager.java b/dspace-api/src/main/java/org/dspace/content/packager/PDFPackager.java index 6c7baad454..f63585f3c4 100644 --- a/dspace-api/src/main/java/org/dspace/content/packager/PDFPackager.java +++ b/dspace-api/src/main/java/org/dspace/content/packager/PDFPackager.java @@ -18,11 +18,11 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.logging.log4j.Logger; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.io.MemoryUsageSetting; -import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.io.ScratchFile; -import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.dspace.authorize.AuthorizeException; @@ -330,19 +330,24 @@ private void crosswalkPDF(Context context, Item item, InputStream metadata) COSDocument cos = null; try { - ScratchFile scratchFile = null; + PDDocument document = null; + try { - long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100; // use up to 80% of JVM free memory - scratchFile = new ScratchFile( - MemoryUsageSetting.setupMixed(useRAM)); // then fallback to temp file (unlimited size) + // Use up to 80% of JVM free memory and fall back to a temp file (unlimited size) + long useRAM = Runtime.getRuntime().freeMemory() * 80 / 100; + document = Loader.loadPDF( + new RandomAccessReadBuffer(metadata), + () -> new ScratchFile(MemoryUsageSetting.setupMixed(useRAM))); } catch (IOException ioe) { log.warn("Error initializing scratch file: " + ioe.getMessage()); } - PDFParser parser = new 
PDFParser(new RandomAccessBufferedFileInputStream(metadata), scratchFile); - parser.parse(); - cos = parser.getDocument(); + // sanity check: loaded PDF document must not be null. + if (document == null) { + throw new MetadataValidationException("The provided stream could not be parsed into a PDF document."); + } + cos = document.getDocument(); // sanity check: PDFBox breaks on encrypted documents, so give up. if (cos.getEncryptionDictionary() != null) { throw new MetadataValidationException("This packager cannot accept an encrypted PDF document."); diff --git a/dspace-api/src/main/java/org/dspace/importer/external/epo/service/EpoImportMetadataSourceServiceImpl.java b/dspace-api/src/main/java/org/dspace/importer/external/epo/service/EpoImportMetadataSourceServiceImpl.java index 552f607827..4ec1f4db39 100644 --- a/dspace-api/src/main/java/org/dspace/importer/external/epo/service/EpoImportMetadataSourceServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/importer/external/epo/service/EpoImportMetadataSourceServiceImpl.java @@ -29,9 +29,9 @@ import org.apache.commons.lang3.StringUtils; import org.apache.http.HttpException; import org.apache.http.client.utils.URIBuilder; +import org.apache.jena.ext.xerces.impl.dv.util.Base64; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.xerces.impl.dv.util.Base64; import org.dspace.app.util.XMLUtils; import org.dspace.content.Item; import org.dspace.importer.external.datamodel.ImportRecord; diff --git a/dspace-server-webapp/src/test/java/org/dspace/app/rest/BitstreamRestControllerIT.java b/dspace-server-webapp/src/test/java/org/dspace/app/rest/BitstreamRestControllerIT.java index 691927c6e4..1e7d6440ff 100644 --- a/dspace-server-webapp/src/test/java/org/dspace/app/rest/BitstreamRestControllerIT.java +++ b/dspace-server-webapp/src/test/java/org/dspace/app/rest/BitstreamRestControllerIT.java @@ -58,6 +58,8 @@ import org.apache.commons.io.IOUtils; import 
org.apache.commons.lang3.CharEncoding; import org.apache.commons.lang3.StringUtils; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.solr.client.solrj.SolrServerException; @@ -989,7 +991,7 @@ private String extractPDFText(byte[] content) throws IOException { try (ByteArrayInputStream source = new ByteArrayInputStream(content); Writer writer = new StringWriter(); - PDDocument pdfDoc = PDDocument.load(source)) { + PDDocument pdfDoc = Loader.loadPDF(new RandomAccessReadBuffer(source))) { pts.writeText(pdfDoc, writer); return writer.toString(); @@ -998,7 +1000,7 @@ private String extractPDFText(byte[] content) throws IOException { private int getNumberOfPdfPages(byte[] content) throws IOException { try (ByteArrayInputStream source = new ByteArrayInputStream(content); - PDDocument pdfDoc = PDDocument.load(source)) { + PDDocument pdfDoc = Loader.loadPDF(new RandomAccessReadBuffer(source))) { return pdfDoc.getNumberOfPages(); } } From cba43b4e150ebe77edb317f7a7b54559b60d2c93 Mon Sep 17 00:00:00 2001 From: Kim Shepherd Date: Wed, 10 Dec 2025 17:56:36 +0100 Subject: [PATCH 3/4] Apply PDFBox 3.x changes to CitationDocumentServiceImpl --- .../disseminate/CitationDocumentServiceImpl.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/dspace-api/src/main/java/org/dspace/disseminate/CitationDocumentServiceImpl.java b/dspace-api/src/main/java/org/dspace/disseminate/CitationDocumentServiceImpl.java index c20961db75..1aa31d4db9 100644 --- a/dspace-api/src/main/java/org/dspace/disseminate/CitationDocumentServiceImpl.java +++ b/dspace-api/src/main/java/org/dspace/disseminate/CitationDocumentServiceImpl.java @@ -23,6 +23,8 @@ import org.apache.commons.lang3.tuple.Pair; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.pdfbox.Loader; +import 
org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; @@ -30,6 +32,7 @@ import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.dspace.authorize.AuthorizeException; import org.dspace.authorize.service.AuthorizeService; import org.dspace.content.Bitstream; @@ -304,7 +307,7 @@ public Pair makeCitedDocument(Context context, Bitstream bitstream Item item = (Item) bitstreamService.getParentObject(context, bitstream); final InputStream inputStream = bitstreamService.retrieve(context, bitstream); try { - sourceDocument = sourceDocument.load(inputStream); + sourceDocument = Loader.loadPDF(new RandomAccessReadBuffer(inputStream)); } finally { inputStream.close(); } @@ -335,9 +338,10 @@ protected void generateCoverPage(Context context, PDDocument document, PDPage co int xwidth = 550; int ygap = 20; - PDFont fontHelvetica = PDType1Font.HELVETICA; - PDFont fontHelveticaBold = PDType1Font.HELVETICA_BOLD; - PDFont fontHelveticaOblique = PDType1Font.HELVETICA_OBLIQUE; + PDFont fontHelvetica = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + PDFont fontHelveticaBold = new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD); + PDFont fontHelveticaOblique = new PDType1Font(Standard14Fonts.FontName.HELVETICA_OBLIQUE); + contentStream.setNonStrokingColor(Color.BLACK); String[][] content = {header1}; From 3b56f4a83730c6103ef606778b3151929ee181a2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Oct 2025 19:56:59 +0000 Subject: [PATCH 4/4] Bump tika.version from 3.2.2 to 3.2.3 Bumps `tika.version` from 3.2.2 to 3.2.3. 
Updates `org.apache.tika:tika-core` from 3.2.2 to 3.2.3 - [Changelog](https://github.com/apache/tika/blob/main/CHANGES.txt) - [Commits](https://github.com/apache/tika/compare/3.2.2...3.2.3) Updates `org.apache.tika:tika-parsers-standard-package` from 3.2.2 to 3.2.3 --- updated-dependencies: - dependency-name: org.apache.tika:tika-core dependency-version: 3.2.3 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.apache.tika:tika-parsers-standard-package dependency-version: 3.2.3 dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 369dc6acc3..cdbdc3a9d4 100644 --- a/pom.xml +++ b/pom.xml @@ -43,7 +43,7 @@ 3.0.5 1.19.0 2.0.17 - 3.2.2 + 3.2.3 1.81 8.0.1