diff --git a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java index 6e4f6a45f..59f8f2008 100644 --- a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java +++ b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java @@ -165,7 +165,8 @@ public static String process(final Document document) throws RatDocumentAnalysis * @throws UnsupportedCharsetException on unsupported charset. */ private static Charset detectCharset(final InputStream stream, final DocumentName documentName) throws IOException, UnsupportedCharsetException { - CharsetDetector encodingDetector = new CharsetDetector(); + final int bytesForCharsetDetection = 256; + CharsetDetector encodingDetector = new CharsetDetector(bytesForCharsetDetection); encodingDetector.setText(stream); CharsetMatch charsetMatch = encodingDetector.detect(); if (charsetMatch != null) { diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java index a29babf75..c0d7efbd6 100644 --- a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java +++ b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java @@ -109,8 +109,10 @@ public boolean equals(final Object obj) { * @throws IOException if this document cannot be read. 
*/ public Reader reader() throws IOException { + final int bytesForCharsetDetection = 256; + CharsetDetector charsetDetector = new CharsetDetector(bytesForCharsetDetection); // RAT-494: Tika's CharsetDetector.getReader() may return null if the read can not be constructed due to I/O or encoding errors - Reader result = new CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name()); + Reader result = charsetDetector.getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name()); if (result == null) { throw new IOException(String.format("Can not read document `%s`", getName())); } diff --git a/src/changes/changes.xml b/src/changes/changes.xml index b1f1a20f7..951c630e2 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -68,6 +68,9 @@ in order to be properly linked in site reports. --> + + Reduce sample size of charset detection from 12000 to 256 bytes (Tika) to increase I/O performance of RAT scans. + Fix NPE that license families is null if licenses are defined manually, reported by huangxiaoping from Hudi.