diff --git a/pom.xml b/pom.xml index 3a9c27b..49f672c 100644 --- a/pom.xml +++ b/pom.xml @@ -55,6 +55,7 @@ 4.0.4 4.0.6 4.0.6 + 1.2.2 1.9.3 3.0.6 5.5.0 @@ -385,6 +386,13 @@ ${opennlp-tools.version} compile + + + + com.github.pemistahl + lingua + ${lingua.version} + diff --git a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java index c1f3852..ff1473e 100644 --- a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java +++ b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java @@ -21,6 +21,8 @@ import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; +import com.github.pemistahl.lingua.api.LanguageDetector; +import com.github.pemistahl.lingua.api.LanguageDetectorBuilder; import org.apache.pdfbox.Loader; import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.pdmodel.PDDocument; @@ -77,6 +79,20 @@ public class PDFMatcher extends Matcher { public static final String TEXT_LENGTH_KEY = "text-length"; public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page"; + /** + * Most likely language of the text of the PDF, analyzed with lingua + */ + public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language"; + /** + * All possible languages of the PDF's text, sorted by their confidence value, analyzed with lingua + */ + public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values"; + + + private static boolean checkLanguage() { + return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false")); + } + private static boolean lookForText() { return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".lookForText", "false")); } @@ -279,9 +295,33 @@ private static void addTextInfo(final Map pdfDetails, final PDDo final String pdfText = new PDFTextStripper().getText(doc); pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages); pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length()); + if (checkLanguage()) + addLanguageInformation(pdfDetails, pdfText); } } + /** + * Adds information about the given text to the given map. + * The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case + * language detection is not reliably possible. + * + * @param pdfDetails map to which the results get added + * @param text text to analyze + */ + public static void addLanguageInformation(final Map pdfDetails, final String text) { + LanguageDetectorBuilder languageDetectorBuilder = + LanguageDetectorBuilder + .fromAllLanguages() + .withMinimumRelativeDistance(0.1); + if (text.length() > 120) + languageDetectorBuilder.withLowAccuracyMode(); + final LanguageDetector languageDetector = languageDetectorBuilder.build(); + final long startTime = System.currentTimeMillis(); + pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text)); + pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString()); + LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - startTime); + } + /** * Checks if the PDF is an electronic invoice. * diff --git a/src/test/java/TestPDFMatcher.java b/src/test/java/TestPDFMatcher.java index a077656..94a5363 100644 --- a/src/test/java/TestPDFMatcher.java +++ b/src/test/java/TestPDFMatcher.java @@ -1,7 +1,10 @@ import static org.hamcrest.CoreMatchers.containsString; import static org.hamcrest.CoreMatchers.hasItems; import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.*; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.hasEntry; +import static org.hamcrest.Matchers.hasKey; +import static org.hamcrest.Matchers.notNullValue; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; @@ -19,8 +22,9 @@ import org.jadice.filetype.database.MimeTypeAction; import org.jadice.filetype.matchers.PDFMatcher; import org.jadice.filetype.pdfutil.SignatureUtil; -import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInfo; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.CsvFileSource; @@ -37,8 +41,14 @@ class TestPDFMatcher { private static Analyzer ANALYZER; - @BeforeAll - public static void init() throws AnalyzerException { + @BeforeEach + public void init(TestInfo testInfo) throws AnalyzerException { + try { + if (testInfo.getTestMethod().get().getName().equals("testContainsText")) + System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true"); + } catch (Exception e) { + e.printStackTrace(); + } ANALYZER = Analyzer.getInstance("/magic.xml"); } @@ -161,7 +171,8 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount, fi @SuppressWarnings("unchecked") @ParameterizedTest @CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1) - void testContainsText(final String filePath, final boolean expected) throws IOException { + void testContainsText(final String filePath, final boolean expected, final String language) throws IOException { + System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true"); System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true"); Map result = ANALYZER.analyze(new File(filePath)); assertNotNull(result); @@ -177,7 +188,12 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx final List textLengthPerPages = (List) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY); final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum(); assertEquals(totalTextLength, sum); + if (!language.equals("null")) { + assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE)); + } + assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES)); } + System.clearProperty(PDFMatcher.class.getName() + ".languageCheck"); System.clearProperty(PDFMatcher.class.getName() + ".lookForText"); } diff --git a/src/test/resources/pdf/contains-text.csv b/src/test/resources/pdf/contains-text.csv index ca8e201..91bed27 100644 --- a/src/test/resources/pdf/contains-text.csv +++ b/src/test/resources/pdf/contains-text.csv @@ -1,19 +1,19 @@ -path,contains-text -src/test/resources/pdf/normal/lorem-ipsum.pdf,true -src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false -src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true -src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true -src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true -src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false -src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false -src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true -src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false -src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true -src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true -src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true -src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true -src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true -src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true -src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true -src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true -src/test/resources/pdf/portfolio/portable-collection-1.pdf,true \ No newline at end of file +path,contains-text,language(ignored if null) +src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN +src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null +src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null +src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null +src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null +src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null +src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null +src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null +src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null +src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null +src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN \ No newline at end of file