Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
<jaxb-api.version>4.0.4</jaxb-api.version>
<jaxb-core.version>4.0.6</jaxb-core.version>
<jaxb-impl.version>4.0.6</jaxb-impl.version>
<lingua.version>1.2.2</lingua.version>
<opennlp-tools.version>1.9.3</opennlp-tools.version>
<pdfbox.version>3.0.6</pdfbox.version>
<poi.version>5.5.0</poi.version>
Expand Down Expand Up @@ -385,6 +386,13 @@
<version>${opennlp-tools.version}</version>
<scope>compile</scope>
</dependency>

<!-- Language recognition -->
<dependency>
<groupId>com.github.pemistahl</groupId>
<artifactId>lingua</artifactId>
<version>${lingua.version}</version>
</dependency>
</dependencies>

<profiles>
Expand Down
40 changes: 40 additions & 0 deletions src/main/java/org/jadice/filetype/matchers/PDFMatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import com.github.pemistahl.lingua.api.LanguageDetector;
import com.github.pemistahl.lingua.api.LanguageDetectorBuilder;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
Expand Down Expand Up @@ -77,6 +79,20 @@ public class PDFMatcher extends Matcher {
public static final String TEXT_LENGTH_KEY = "text-length";
public static final String TEXT_LENGTH_PER_PAGE_KEY = "text-length-per-page";

/**
 * Key of the detail entry holding the most likely language of the PDF's text, determined with
 * <a href="https://github.com/pemistahl/lingua">lingua</a>.
 * The stored value is the string representation of the detected language.
 */
public static final String MOST_LIKELY_TEXT_LANGUAGE = "most-likely-text-language";
/**
 * Key of the detail entry holding the language confidence values computed for the PDF's text with
 * <a href="https://github.com/pemistahl/lingua">lingua</a>, covering all possible languages
 * sorted by their confidence value.
 */
public static final String TEXT_LANGUAGE_CONFIDENCE_VALUES = "text-language-confidence-values";


/**
 * Tells whether language detection is enabled via the system property
 * {@code <fully qualified PDFMatcher class name>.languageCheck}.
 *
 * @return {@code true} if the property is set to "true" (case-insensitive); {@code false} otherwise,
 *         including when the property is not set
 */
private static boolean checkLanguage() {
  final String propertyName = PDFMatcher.class.getName() + ".languageCheck";
  final String configured = System.getProperty(propertyName, "false");
  return Boolean.parseBoolean(configured);
}

/**
 * Tells whether text analysis is enabled via the system property
 * {@code <fully qualified PDFMatcher class name>.lookForText}.
 *
 * @return {@code true} if the property is set to "true" (case-insensitive); {@code false} otherwise,
 *         including when the property is not set
 */
private static boolean lookForText() {
  final String propertyName = PDFMatcher.class.getName() + ".lookForText";
  final String configured = System.getProperty(propertyName, "false");
  return Boolean.parseBoolean(configured);
}
Expand Down Expand Up @@ -279,9 +295,33 @@ private static void addTextInfo(final Map<String, Object> pdfDetails, final PDDo
final String pdfText = new PDFTextStripper().getText(doc);
pdfDetails.put(TEXT_LENGTH_PER_PAGE_KEY, textLengthPerPages);
pdfDetails.put(TEXT_LENGTH_KEY, pdfText.replaceAll("([\\r\\n])", "").length());
if (checkLanguage())
addLanguageInformation(pdfDetails, pdfText);
}
}

/**
 * Runs lingua language detection on the given text and stores the outcome in the given map.
 * <p>
 * Two entries are written: {@link #TEXT_LANGUAGE_CONFIDENCE_VALUES} receives the confidence values
 * computed by the detector and {@link #MOST_LIKELY_TEXT_LANGUAGE} receives the string representation
 * of the detected language. The most likely text language will be
 * {@link com.github.pemistahl.lingua.api.Language#UNKNOWN} in case language detection is not
 * reliably possible. Texts longer than 120 characters are analyzed in lingua's low accuracy mode.
 *
 * @param pdfDetails map to which the results get added
 * @param text text to analyze
 */
public static void addLanguageInformation(final Map<String, Object> pdfDetails, final String text) {
  final LanguageDetectorBuilder builder =
      LanguageDetectorBuilder.fromAllLanguages().withMinimumRelativeDistance(0.1);
  final boolean exceedsShortTextLimit = text.length() > 120;
  if (exceedsShortTextLimit) {
    builder.withLowAccuracyMode();
  }
  final LanguageDetector detector = builder.build();

  // Only the two detection calls are timed, not the construction of the detector.
  final long begin = System.currentTimeMillis();
  pdfDetails.put(TEXT_LANGUAGE_CONFIDENCE_VALUES, detector.computeLanguageConfidenceValues(text));
  final String mostLikelyLanguage = detector.detectLanguageOf(text).toString();
  pdfDetails.put(MOST_LIKELY_TEXT_LANGUAGE, mostLikelyLanguage);
  LOGGER.debug("Language recognition took {} ms.", System.currentTimeMillis() - begin);
}

/**
* Checks if the PDF is an electronic invoice.
*
Expand Down
26 changes: 21 additions & 5 deletions src/test/java/TestPDFMatcher.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.hasItems;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.*;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasEntry;
import static org.hamcrest.Matchers.hasKey;
import static org.hamcrest.Matchers.notNullValue;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
Expand All @@ -19,8 +22,9 @@
import org.jadice.filetype.database.MimeTypeAction;
import org.jadice.filetype.matchers.PDFMatcher;
import org.jadice.filetype.pdfutil.SignatureUtil;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInfo;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvFileSource;

Expand All @@ -37,8 +41,14 @@ class TestPDFMatcher {

private static Analyzer ANALYZER;

@BeforeAll
public static void init() throws AnalyzerException {
/**
 * Prepares every test: enables the PDF language check when the test about to run is
 * {@code testContainsText}, then (re-)loads the analyzer from {@code /magic.xml}.
 *
 * @param testInfo JUnit-provided information about the test that is about to run
 * @throws AnalyzerException if the analyzer cannot be created
 */
@BeforeEach
public void init(TestInfo testInfo) throws AnalyzerException {
  // getTestMethod() returns an Optional; map over it instead of calling get() inside a
  // catch-all try block, so a missing test method is handled without swallowing exceptions.
  final boolean isContainsTextTest = testInfo.getTestMethod()
      .map(method -> "testContainsText".equals(method.getName()))
      .orElse(false);
  if (isContainsTextTest) {
    System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
  }
  ANALYZER = Analyzer.getInstance("/magic.xml");
}

Expand Down Expand Up @@ -161,7 +171,8 @@ void testSignedPDFs(final String urlString, final int expectedSignatureCount, fi
@SuppressWarnings("unchecked")
@ParameterizedTest
@CsvFileSource(resources = "/pdf/contains-text.csv", numLinesToSkip = 1)
void testContainsText(final String filePath, final boolean expected) throws IOException {
void testContainsText(final String filePath, final boolean expected, final String language) throws IOException {
System.setProperty(PDFMatcher.class.getName() + ".languageCheck", "true");
System.setProperty(PDFMatcher.class.getName() + ".lookForText", "true");
Map<String, Object> result = ANALYZER.analyze(new File(filePath));
assertNotNull(result);
Expand All @@ -177,7 +188,12 @@ void testContainsText(final String filePath, final boolean expected) throws IOEx
final List<Integer> textLengthPerPages = (List<Integer>) pdfDetails.get(PDFMatcher.TEXT_LENGTH_PER_PAGE_KEY);
final int sum = textLengthPerPages.stream().mapToInt(Integer::intValue).sum();
assertEquals(totalTextLength, sum);
if (!language.equals("null")) {
assertEquals(language, pdfDetails.get(PDFMatcher.MOST_LIKELY_TEXT_LANGUAGE));
}
assertTrue(pdfDetails.containsKey(PDFMatcher.TEXT_LANGUAGE_CONFIDENCE_VALUES));
}
System.clearProperty(PDFMatcher.class.getName() + ".languageCheck");
System.clearProperty(PDFMatcher.class.getName() + ".lookForText");
}

Expand Down
38 changes: 19 additions & 19 deletions src/test/resources/pdf/contains-text.csv
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
path,contains-text
src/test/resources/pdf/normal/lorem-ipsum.pdf,true
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true
path,contains-text,language(ignored if null)
src/test/resources/pdf/normal/lorem-ipsum.pdf,true,LATIN
src/test/resources/pdf/normal/PDF-testdoc-1.3-Karte.pdf,false,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo.pdf,true,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-annotation.pdf,true,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-hallo-complex.pdf,true,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-lines.pdf,false,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-MultiImage.pdf,false,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-RotatedImages.pdf,true,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-SingleImage.pdf,false,null
src/test/resources/pdf/normal/PDF-testdoc-1.4-TextTest.pdf,true,null
src/test/resources/pdf/encrypted/02_enc40bit_no-pw.pdf,true,null
src/test/resources/pdf/encrypted/03_enc40bit_pw-owner.pdf,true,null
src/test/resources/pdf/encrypted/06_enc128bit_no-pw.pdf,true,null
src/test/resources/pdf/encrypted/07_enc128bit_pw-owner.pdf,true,null
src/test/resources/pdf/encrypted/10_enc128bit-aes_no-pw.pdf,true,null
src/test/resources/pdf/encrypted/11_enc128bit-aes_pw-owner.pdf,true,null
src/test/resources/pdf/encrypted/Contrapunctus I BWV 1080.pdf,true,null
src/test/resources/pdf/portfolio/portable-collection-1.pdf,true,GERMAN
Loading