From 6ccac6c5b948aca85e368961031a5cff8714b897 Mon Sep 17 00:00:00 2001
From: Ryan Schmitt <ryansch@amazon.com>
Date: Sun, 18 Jan 2026 12:51:49 -0800
Subject: [PATCH 1/2] Use smaller text samples for charset detection

By default, Tika uses the first 12,000 bytes of a document for charset
detection. This is an extremely computationally intensive process: it
checks every byte of the sample against every supported charset and
also performs n-gram-based natural language detection for ISO-8859-1.
As a result, the majority of apache-rat runtime is actually spent
performing charset detection.

Reducing the sample size to 256 bytes reduces the cost of charset
detection by over 95%. On my machine, this single change cuts the total
runtime of `apache-rat:check` in half.
---
 .../src/main/java/org/apache/rat/analysis/TikaProcessor.java  | 3 ++-
 .../src/main/java/org/apache/rat/api/Document.java            | 4 +++-
 src/changes/changes.xml                                       | 3 +++
 3 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
index 6e4f6a45f..59f8f2008 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/analysis/TikaProcessor.java
@@ -165,7 +165,8 @@ public static String process(final Document document) throws RatDocumentAnalysis
      * @throws UnsupportedCharsetException on unsupported charset.
      */
     private static Charset detectCharset(final InputStream stream, final DocumentName documentName) throws IOException, UnsupportedCharsetException {
-        CharsetDetector encodingDetector = new CharsetDetector();
+        final int bytesForCharsetDetection = 256;
+        CharsetDetector encodingDetector = new CharsetDetector(bytesForCharsetDetection);
         encodingDetector.setText(stream);
         CharsetMatch charsetMatch = encodingDetector.detect();
         if (charsetMatch != null) {
diff --git a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
index a29babf75..c0d7efbd6 100644
--- a/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
+++ b/apache-rat-core/src/main/java/org/apache/rat/api/Document.java
@@ -109,8 +109,10 @@ public boolean equals(final Object obj) {
      * @throws IOException if this document cannot be read.
      */
     public Reader reader() throws IOException {
+        final int bytesForCharsetDetection = 256;
+        CharsetDetector charsetDetector = new CharsetDetector(bytesForCharsetDetection);
         // RAT-494: Tika's CharsetDetector.getReader() may return null if the read can not be constructed due to I/O or encoding errors
-        Reader result = new CharsetDetector().getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name());
+        Reader result = charsetDetector.getReader(TikaProcessor.markSupportedInputStream(inputStream()), getMetaData().getCharset().name());
         if (result == null) {
             throw new IOException(String.format("Can not read document `%s`", getName()));
         }
diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index b1f1a20f7..9260c0858 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -68,6 +68,9 @@ in order to be properly linked in site reports.
     </release>
     -->
     <release version="1.0.0" date="xxxx-yy-zz" description="Current SNAPSHOT - release to be done">
+      <action issue="RAT-533" type="fix" dev="pottlinger" due-to="Ryan Schmitt">
+        Reduce sample size of charset detection from 12000 to 256 byte (Tika) to increase I/O performance of RAT scane.s
+      </action>
       <action issue="RAT-531" type="fix" dev="pottlinger" due-to="huangxiaoping">
         Fix NPE that license families is null if licenses are defined manually, reported by huangxiaoping from Hudi.
       </action>

From 8ea7c23b12cedc4c639b2d6bbc1d721ce3edfd4f Mon Sep 17 00:00:00 2001
From: "P. Ottlinger" <ottlinger@users.noreply.github.com>
Date: Wed, 21 Jan 2026 09:05:40 +0100
Subject: [PATCH 2/2] Update changes.xml to fix typo

---
 src/changes/changes.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/changes/changes.xml b/src/changes/changes.xml
index 9260c0858..951c630e2 100644
--- a/src/changes/changes.xml
+++ b/src/changes/changes.xml
@@ -69,7 +69,7 @@ in order to be properly linked in site reports.
     -->
     <release version="1.0.0" date="xxxx-yy-zz" description="Current SNAPSHOT - release to be done">
       <action issue="RAT-533" type="fix" dev="pottlinger" due-to="Ryan Schmitt">
-        Reduce sample size of charset detection from 12000 to 256 byte (Tika) to increase I/O performance of RAT scane.s
+        Reduce sample size of charset detection from 12000 to 256 bytes (Tika) to improve the performance of RAT scans.
       </action>
       <action issue="RAT-531" type="fix" dev="pottlinger" due-to="huangxiaoping">
         Fix NPE that license families is null if licenses are defined manually, reported by huangxiaoping from Hudi.