diff --git a/pom.xml b/pom.xml
index 3a9c27b..49f672c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -55,6 +55,7 @@
4.0.4
4.0.6
4.0.6
+ 1.2.2
1.9.3
3.0.6
5.5.0
@@ -385,6 +386,13 @@
${opennlp-tools.version}
compile
+
+
+
+ com.github.pemistahl
+ lingua
+ ${lingua.version}
+
diff --git a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java
index c1f3852..ff1473e 100644
--- a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java
+++ b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java
@@ -21,6 +21,8 @@
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
+import com.github.pemistahl.lingua.api.LanguageDetector;
+import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -77,6 +79,20 @@ public class PDFMatcher extends Matcher {
public static final String TEXT_LENGTH_KEY = "text-length";
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";
+ /**
+  * Key of the result entry holding the most likely language of the PDF's text, as detected with lingua.
+  */
+ public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
+ /**
+  * Key of the result entry holding all possible languages of the PDF's text, sorted by their confidence value,
+  * as computed with lingua.
+  */
+ public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";
+
+
+ private static boolean checkLanguage() {
+ return "true".equalsIgnoreCase(System.getProperty(PDFMatcher.class.getName() + ".languageCheck", "false"));
+ }
+
/**
 * Tells whether text lookup is switched on via the system property
 * {@code <fully qualified PDFMatcher class name>.lookForText}.
 *
 * @return {@code true} only if the property is set to "true" (ignoring case); {@code false} otherwise
 */
private static boolean lookForText() {
  final String propertyName = PDFMatcher.class.getName() + ".lookForText";
  // Boolean.getBoolean is true exactly when the property exists and equals "true", ignoring case
  return Boolean.getBoolean(propertyName);
}
@@ -279,9 +295,33 @@ private static void addTextInfo(final Map pdfDetails, final PDDo
final String pdfText = new PDFTextStripper().getText(doc);
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
+ if (checkLanguage())
+ addLanguageInformation(pdfDetails, pdfText);
}
}
+ /**
+  * Detects the language of the given text with lingua and adds the results to the given map.
+  * <p>
+  * Two entries are written: {@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} receives the confidence values computed
+  * by lingua, and {@link #MOST_LIKELY_TEXT_LANGUAGE} receives the string representation of the detected language.
+  * The most likely text language will be {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case
+  * language detection is not reliably possible.
+  *
+  * @param pdfDetails map to which the results get added
+  * @param text text to analyze
+  * @throws NullPointerException if {@code pdfDetails} or {@code text} is {@code null}
+  */
+ public static void addLanguageInformation(final Map pdfDetails, final String text) {
+   // Fail fast with a named argument instead of an anonymous NPE after the detector has been built.
+   java.util.Objects.requireNonNull(pdfDetails, "pdfDetails");
+   java.util.Objects.requireNonNull(text, "text");
+
+   // Minimum gap between the two best candidates; below it the detected language is reported as UNKNOWN.
+   final double minimumRelativeDistance = 0.1;
+   // Text length above which lingua's low accuracy mode is switched on.
+   // NOTE(review): lingua's documentation describes this mode as suitable for texts longer than
+   // 120 characters - confirm against the lingua version in use.
+   final int lowAccuracyModeThreshold = 120;
+
+   final LanguageDetectorBuilder languageDetectorBuilder =
+       LanguageDetectorBuilder
+           .fromAllLanguages()
+           .withMinimumRelativeDistance(minimumRelativeDistance);
+   if (text.length() > lowAccuracyModeThreshold) {
+     languageDetectorBuilder.withLowAccuracyMode();
+   }
+   final LanguageDetector languageDetector = languageDetectorBuilder.build();
+
+   // System.nanoTime() is monotonic, so the measured duration cannot be skewed by wall-clock adjustments.
+   final long startNanos = System.nanoTime();
+   pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, languageDetector.computeLanguageConfidenceValues(text));
+   pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, languageDetector.detectLanguageOf(text).toString());
+   final long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
+   LOGGER.debug("Language recognition took {} ms.", elapsedMillis);
+ }
+
/**
* Checks if the PDF is an electronic invoice.
*
diff --git a/src/test/java/TestPDFMatcher.java b/src/test/java/TestPDFMatcher.java
index a077656..94a5363 100644
--- a/src/test/java/TestPDFMatcher.java
+++ b/src/test/java/TestPDFMatcher.java
@@ -1,7 +1,10 @@
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.hasItems;
import static org.hamcrest.MatcherAssert.assertThat;
-import static org.hamcrest.Matchers.*;
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.hasEntry;
+import static org.hamcrest.Matchers.hasKey;
+import static org.hamcrest.Matchers.notNullValue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
@@ -19,8 +22,9 @@
import org.jadice.filetype.database.MimeTypeAction;
import org.jadice.filetype.matchers.PDFMatcher;
import org.jadice.filetype.pdfutil.SignatureUtil;
-import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInfo;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvFileSource;
@@ -37,8 +41,14 @@ class TestPDFMatcher {
private static Analyzer ANALYZER;
- @BeforeAll
- public static void init() throws AnalyzerException {
+ /**
+  * Prepares the analyzer before each test.
+  * <p>
+  * When the test about to run is {@code testContainsText}, the language check of {@link PDFMatcher} is
+  * switched on through its system property before the analyzer is created.
+  *
+  * @param testInfo information about the test that is about to run
+  * @throws AnalyzerException if the analyzer cannot be created
+  */
+ @BeforeEach
+ public void init(TestInfo testInfo) throws AnalyzerException {
+   // The test method may be absent; handle the Optional explicitly instead of calling get()
+   // inside a catch-all block that only prints the stack trace.
+   testInfo.getTestMethod()
+       .map(method -> method.getName())
+       .filter("testContainsText"::equals)
+       .ifPresent(name -> System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true"));
ANALYZER = Analyzer.getInstance("/magic.xml");
}
@@ -161,7 +171,8 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount, fi
@SuppressWarnings("unchecked")
@ParameterizedTest
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
- void testContainsText(final String filePath, final boolean expected) throws IOException {
+ void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
+ System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true");
Map result = ANALYZER.analyze(new File(filePath));
assertNotNull(result);
@@ -177,7 +188,12 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
final List textLengthPerPages = (List) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
assertEquals(totalTextLength, sum);
+ if (!language.equals("null")) {
+ assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
+ }
+ assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
}
+ System.clearProperty(PDFMatcher.class.getName() + ".languageCheck");
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
}
diff --git a/src/test/resources/pdf/contains-text.csv b/src/test/resources/pdf/contains-text.csv
index ca8e201..91bed27 100644
--- a/src/test/resources/pdf/contains-text.csv
+++ b/src/test/resources/pdf/contains-text.csv
@@ -1,19 +1,19 @@
-path,contains-text
-src/test/resources/pdf/normal/lorem-ipsum.pdf,true
-src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
-src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
-src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
-src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
-src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
-src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
-src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
-src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
-src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
-src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
-src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
-src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
-src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
-src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
-src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
-src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
-src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
\ No newline at end of file
+path,contains-text,language(ignored if null)
+src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
+src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
+src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
+src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
+src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
+src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
+src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
+src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
+src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
+src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
+src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN
\ No newline at end of file