From 6ccac6c5b948aca85e368961031a5cff8714b897 Mon Sep 17 00:00:00 2001 From: Ryan Schmitt Date: Sun, 18 Jan 2026 12:51:49 -0800 Subject: [PATCH 1/2] Use smaller text samples for charset detection Tika by default uses the first 12,000 bytes of a document for charset detection. This is an extremely computationally intensive process that checks every byte of the sample against every supported charset, and which also performs ngram-based natural language detection for ISO-8859-1. As a result, the majority of apache-rat runtime is actually spent performing charset detection. Reducing the sample size to 256 bytes reduces the cost of charset detection by over 95%. On my machine, this single change cuts the total runtime of `apache-rat:check` in half. --- .../src/main/java/org/apache/rat/analysis/TikaProcessor.java | 3 ++- .../src/main/java/org/apache/rat/api/Document.java | 4 +++- src/changes/changes.xml | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java index 6e4f6a45f..59f8f2008 100644 --- a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java +++ b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java @@ -165,7 +165,8 @@ public static String process(final Document document) throws RatDocumentAnalysis * @throws UnsupportedCharsetException on unsupported charset. 
*/ private static Charset detectCharset(final InputStream stream, final DocumentName documentName) throws IOException, UnsupportedCharsetException { - CharsetDetector encodingDetector = new CharsetDetector(); + final int bytesForCharsetDetection = 256; + CharsetDetector encodingDetector = new CharsetDetector(bytesForCharsetDetection); encodingDetector.setText(stream); CharsetMatch charsetMatch = encodingDetector.detect(); if (charsetMatch != null) { diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java index a29babf75..c0d7efbd6 100644 --- a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java +++ b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java @@ -109,8 +109,10 @@ public boolean equals(final Object obj) { * @throws IOException if this document cannot be read. */ public Reader reader() throws IOException { + final int bytesForCharsetDetection = 256; + CharsetDetector charsetDetector = new CharsetDetector(bytesForCharsetDetection); // RAT-494: Tika's CharsetDetector.getReader() may return null if the read can not be constructed due to I/O or encoding errors - Reader result = new CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name()); + Reader result = charsetDetector.getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name()); if (result == null) { throw new IOException(String.format("Can not read document `%s`", getName())); } diff --git a/src/changes/changes.xml b/src/changes/changes.xml index b1f1a20f7..9260c0858 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -68,6 +68,9 @@ in order to be properly linked in site reports. 
--> + + Reduce sample size of charset detection from 12000 to 256 byte (Tika) to increase I/O performance of RAT scane.s + Fix NPE that license families is null if licenses are defined manually, reported by huangxiaoping from Hudi. From 8ea7c23b12cedc4c639b2d6bbc1d721ce3edfd4f Mon Sep 17 00:00:00 2001 From: "P. Ottlinger" Date: Wed, 21 Jan 2026 09:05:40 +0100 Subject: [PATCH 2/2] Update changes.xml to fix typo --- src/changes/changes.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 9260c0858..951c630e2 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -69,7 +69,7 @@ in order to be properly linked in site reports. --> - Reduce sample size of charset detection from 12000 to 256 byte (Tika) to increase I/O performance of RAT scane.s + Reduce sample size of charset detection from 12000 to 256 bytes (Tika) to increase I/O performance of RAT scans. Fix NPE that license families is null if licenses are defined manually, reported by huangxiaoping from Hudi.