From 17e4fed12c0fdee2c2eee99e3e3a2e06c2d53aa2 Mon Sep 17 00:00:00 2001 From: Daniel Radeau Date: Mon, 12 Jan 2026 12:26:37 +0100 Subject: [PATCH] bugs #15608 fix(import): big objects digest implicit 2GB limitation changes: * resolve application failure on objects over 2GB * add progress feedback for big objects at import --- .../tools/sedalib/core/BinaryDataObject.java | 147 ++++++------------ .../utils/digest/DigestProgressLogger.java | 131 ++++++++++++++++ .../sedalib/utils/digest/DigestSha512.java | 92 +++++++++++ .../utils/digest/NioDigestComputer.java | 79 ++++++++++ .../utils/digest/DigestSha512Test.java | 119 ++++++++++++++ 5 files changed, 472 insertions(+), 96 deletions(-) create mode 100644 sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestProgressLogger.java create mode 100644 sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512.java create mode 100644 sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/NioDigestComputer.java create mode 100644 sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512Test.java diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/core/BinaryDataObject.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/core/BinaryDataObject.java index 780153af..22b41876 100644 --- a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/core/BinaryDataObject.java +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/core/BinaryDataObject.java @@ -28,10 +28,8 @@ package fr.gouv.vitam.tools.sedalib.core; import com.fasterxml.jackson.annotation.JsonIgnore; -import fr.gouv.vitam.tools.sedalib.core.seda.SedaContext; import fr.gouv.vitam.tools.sedalib.core.seda.SedaVersion; import fr.gouv.vitam.tools.sedalib.droid.DroidIdentifier; -import fr.gouv.vitam.tools.sedalib.metadata.SEDAMetadata; import fr.gouv.vitam.tools.sedalib.metadata.content.PersistentIdentifier; import fr.gouv.vitam.tools.sedalib.metadata.data.FileInfo; import 
fr.gouv.vitam.tools.sedalib.metadata.data.FormatIdentification; @@ -45,14 +43,9 @@ import uk.gov.nationalarchives.droid.core.interfaces.IdentificationResult; import java.io.*; -import java.nio.MappedByteBuffer; -import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.StandardOpenOption; import java.nio.file.attribute.FileTime; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.*; import static fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger.*; @@ -82,25 +75,25 @@ boolean isForVersion(int version) { } private static final List ALL_FIELDS = Arrays.asList( - new MetadataField("DataObjectProfile", new ComplexListMetadataKind(StringType.class, false), 2, 3), - new MetadataField("DataObjectSystemId", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), - new MetadataField("DataObjectGroupSystemId", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), - new MetadataField("Relationship", new ComplexListMetadataKind(Relationship.class, true), 1, 2, 3), - new MetadataField("DataObjectGroupReferenceId", new ComplexListMetadataKind(StringType.class, false), 1), - new MetadataField("DataObjectGroupId", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), - new MetadataField("DataObjectVersion", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), - new MetadataField("PersistentIdentifier", new ComplexListMetadataKind(PersistentIdentifier.class, true), 3), - new MetadataField("DataObjectUse", new ComplexListMetadataKind(StringType.class, false), 3), - new MetadataField("DataObjectNumber", new ComplexListMetadataKind(IntegerType.class, false), 3), - new MetadataField("Uri", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), - new MetadataField("MessageDigest", new ComplexListMetadataKind(DigestType.class, false), 1, 2, 3), - new MetadataField("Size", new ComplexListMetadataKind(IntegerType.class, false), 1, 
2, 3), - new MetadataField("Compressed", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), - new MetadataField("FormatIdentification", new ComplexListMetadataKind(FormatIdentification.class, false), 1, 2, 3), - new MetadataField("FileInfo", new ComplexListMetadataKind(FileInfo.class, false), 1, 2, 3), - new MetadataField("Metadata", new ComplexListMetadataKind(Metadata.class, false), 1, 2, 3), - new MetadataField("OtherMetadata", new ComplexListMetadataKind(AnyXMLListType.class, false), 1, 2, 3) - ); + new MetadataField("DataObjectProfile", new ComplexListMetadataKind(StringType.class, false), 2, 3), + new MetadataField("DataObjectSystemId", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), + new MetadataField("DataObjectGroupSystemId", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), + new MetadataField("Relationship", new ComplexListMetadataKind(Relationship.class, true), 1, 2, 3), + new MetadataField("DataObjectGroupReferenceId", new ComplexListMetadataKind(StringType.class, false), 1), + new MetadataField("DataObjectGroupId", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), + new MetadataField("DataObjectVersion", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), + new MetadataField("PersistentIdentifier", new ComplexListMetadataKind(PersistentIdentifier.class, true), 3), + new MetadataField("DataObjectUse", new ComplexListMetadataKind(StringType.class, false), 3), + new MetadataField("DataObjectNumber", new ComplexListMetadataKind(IntegerType.class, false), 3), + new MetadataField("Uri", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), + new MetadataField("MessageDigest", new ComplexListMetadataKind(DigestType.class, false), 1, 2, 3), + new MetadataField("Size", new ComplexListMetadataKind(IntegerType.class, false), 1, 2, 3), + new MetadataField("Compressed", new ComplexListMetadataKind(StringType.class, false), 1, 2, 3), + new MetadataField("FormatIdentification", new 
ComplexListMetadataKind(FormatIdentification.class, false), 1, + 2, 3), + new MetadataField("FileInfo", new ComplexListMetadataKind(FileInfo.class, false), 1, 2, 3), + new MetadataField("Metadata", new ComplexListMetadataKind(Metadata.class, false), 1, 2, 3), + new MetadataField("OtherMetadata", new ComplexListMetadataKind(AnyXMLListType.class, false), 1, 2, 3)); private static LinkedHashMap createMetadataMapForVersion(int version) { LinkedHashMap map = new LinkedHashMap<>(); @@ -128,7 +121,7 @@ private static LinkedHashMap createMetadataMapF @Override public LinkedHashMap getMetadataMap() throws SEDALibException { return (LinkedHashMap) ComplexListInterface - .getMetadataMap(this.getClass()); + .getMetadataMap(this.getClass()); } /** @@ -138,7 +131,7 @@ public LinkedHashMap getMetadataMap() throws SE @Override public boolean isNotExpandable() { return ComplexListInterface - .isNotExpandable(this.getClass()); + .isNotExpandable(this.getClass()); } // Inner element @@ -171,8 +164,10 @@ public BinaryDataObject(DataObjectPackage dataObjectPackage) { * If an explicit filename is provided, it will be used, * otherwise the filename will be extracted from the path if available. 
* - * @param path The file path to extract filename from if no explicit name provided - * @param explicitFilename The explicit filename to use, or null to use path filename + * @param path The file path to extract filename from if no explicit + * name provided + * @param explicitFilename The explicit filename to use, or null to use path + * filename */ private void addFilenameMetadata(Path path, String explicitFilename) { String nameValue = explicitFilename != null @@ -202,7 +197,8 @@ private void addFilenameMetadata(Path path, String explicitFilename) { * @param explicitFilename the filename metadata * @param dataObjectVersion the DataObjectVersion */ - public BinaryDataObject(DataObjectPackage dataObjectPackage, Path path, String explicitFilename, String dataObjectVersion) { + public BinaryDataObject(DataObjectPackage dataObjectPackage, Path path, String explicitFilename, + String dataObjectVersion) { super(dataObjectPackage); if (dataObjectVersion != null) metadataList.add(new StringType("DataObjectVersion", dataObjectVersion)); @@ -262,75 +258,33 @@ private static String getExtension(String fileName) { return i < 0 ? "seda" : fileName.substring(i + 1); } - private static final String SHA512_ALGORITHM = "SHA-512"; - private static final long SMALL_FILE_THRESHOLD = 2 * 1024 * 1024; // 10Mo - - /** - * Computes the message digest (hash) for a file. 
- * - * @param digest The MessageDigest instance to use for computing the hash - * @param path The path to the file to hash - * @return The computed digest bytes - * @throws SEDALibException if an error occurs reading the file - */ - private static byte[] computeDigest(MessageDigest digest, Path path) throws SEDALibException { - try { - long size = Files.size(path); - - if (size <= SMALL_FILE_THRESHOLD) { - byte[] all = Files.readAllBytes(path); - digest.update(all); - return digest.digest(); - } - - // Quicker on big files - try (FileChannel channel = FileChannel.open(path, StandardOpenOption.READ)) { - MappedByteBuffer buffer = channel.map(FileChannel.MapMode.READ_ONLY, 0, size); - digest.update(buffer); - return digest.digest(); - } - - } catch (IOException e) { - throw new SEDALibException( - String.format("Impossible de calculer le hash du fichier [%s]", path), e); - } - } - /** - * Converts a byte array to its hexadecimal string representation. + * Gets the digest sha 512. * - * @param bytes The byte array to convert - * @return The hexadecimal string representation of the bytes + * @param path the path of the file to hash + * @return the digest sha 512 + * @throws SEDALibException if unable to get digest */ - private static String bytesToHex(byte[] bytes) { - StringBuilder sb = new StringBuilder(bytes.length * 2); - for (byte b : bytes) { - sb.append(String.format("%02x", b)); - } - return sb.toString(); + public static String getDigestSha512(Path path) throws SEDALibException { + return fr.gouv.vitam.tools.sedalib.utils.digest.DigestSha512.compute(path); } /** * Gets the digest sha 512. 
* - * @param path the path of the file to hash + * @param path the path of the file to hash + * @param logger the logger * @return the digest sha 512 * @throws SEDALibException if unable to get digest */ - public static String getDigestSha512(Path path) throws SEDALibException { - MessageDigest digest; - try { - digest = MessageDigest.getInstance(SHA512_ALGORITHM); - byte[] hash = computeDigest(digest, path); - return bytesToHex(hash); - } catch (NoSuchAlgorithmException e) { - throw new SEDALibException("Impossible de mobiliser l'algorithme de hashage " + SHA512_ALGORITHM, e); - } + public static String getDigestSha512(Path path, SEDALibProgressLogger logger) throws SEDALibException { + return fr.gouv.vitam.tools.sedalib.utils.digest.DigestSha512.compute(path, logger); } /** * Updates the FileInfo metadata for a binary data object. - * If no FileInfo exists, creates a new one. Sets the filename from the onDiskPath if not already set. + * If no FileInfo exists, creates a new one. Sets the filename from the + * onDiskPath if not already set. * Updates the last modified timestamp. * * @param lastModifiedTime The last modified timestamp to set @@ -370,8 +324,7 @@ private IdentificationResult identifyFormat(SEDALibProgressLogger logger, Path p logger, OBJECTS_WARNINGS, "sedalib: impossible de faire l'identification Droid pour le fichier [" + path + "]", - e - ); + e); return null; } } @@ -380,7 +333,8 @@ private IdentificationResult identifyFormat(SEDALibProgressLogger logger, Path p * Extract technical elements (lastmodified date, size, format, digest...) from * file and complete the BinaryDataObject metadata. 
* - * @param sedaLibProgressLogger the progress logger or null if no progress log expected + * @param sedaLibProgressLogger the progress logger or null if no progress log + * expected * @throws SEDALibException if unable to get size or lastmodified date (probably * can't access file) */ @@ -396,7 +350,7 @@ public void extractTechnicalElements(SEDALibProgressLogger sedaLibProgressLogger } updateFileInfo(lastModifiedTime); - addMetadata(new DigestType("MessageDigest", getDigestSha512(onDiskPath), "SHA-512")); + addMetadata(new DigestType("MessageDigest", getDigestSha512(onDiskPath, sedaLibProgressLogger), "SHA-512")); addMetadata(new IntegerType("Size", size)); IdentificationResult idResult = identifyFormat(sedaLibProgressLogger, onDiskPath); @@ -441,30 +395,31 @@ public void toSedaXml(SEDAXMLStreamWriter xmlWriter, SEDALibProgressLogger sedaL super.toSedaXml(xmlWriter, sedaLibProgressLogger); } - // SEDA XML importer - /** * Import the BinaryDataObject in XML expected form from the SEDA Manifest in * the DataObjectPackage. 
* * @param xmlReader the SEDAXMLEventReader reading the SEDA manifest * @param dataObjectPackage the DataObjectPackage to be completed - * @param sedaLibProgressLogger the progress logger or null if no progress log expected + * @param sedaLibProgressLogger the progress logger or null if no progress log + * expected * @return the read BinaryDataObject, or null if not a BinaryDataObject * @throws SEDALibException if the XML can't be read or the SEDA scheme is * not respected * @throws InterruptedException if export process is interrupted */ public static BinaryDataObject fromSedaXml(SEDAXMLEventReader xmlReader, DataObjectPackage dataObjectPackage, - SEDALibProgressLogger sedaLibProgressLogger) throws SEDALibException, InterruptedException { + SEDALibProgressLogger sedaLibProgressLogger) throws SEDALibException, InterruptedException { BinaryDataObject bdo = new BinaryDataObject(); - return (importUnitaryDataObjectPackageIdElementFromSedaXml(bdo, xmlReader, dataObjectPackage, sedaLibProgressLogger) - ? bdo : null); + return (importUnitaryDataObjectPackageIdElementFromSedaXml(bdo, xmlReader, dataObjectPackage, + sedaLibProgressLogger) + ? bdo + : null); } - /** + /** * Gets the FileInfo metadata from the metadata list. * * @return the FileInfo metadata, or null if not found diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestProgressLogger.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestProgressLogger.java new file mode 100644 index 00000000..fc14188b --- /dev/null +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestProgressLogger.java @@ -0,0 +1,131 @@ +/** + * Copyright French Prime minister Office/DINSIC/Vitam Program (2015-2019) + *

+ * contact.vitam@programmevitam.fr + *

+ * This software is developed as a validation helper tool, for constructing Submission Information Packages (archives + * sets) in the Vitam program whose purpose is to implement a digital archiving back-office system managing high + * volumetry securely and efficiently. + *

+ * This software is governed by the CeCILL 2.1 license under French law and abiding by the rules of distribution of free + * software. You can use, modify and/ or redistribute the software under the terms of the CeCILL 2.1 license as + * circulated by CEA, CNRS and INRIA at the following URL "http://www.cecill.info". + *

+ * As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, + * users are provided only with a limited warranty and the software's author, the holder of the economic rights, and the + * successive licensors have only limited liability. + *

+ * In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or + * developing or reproducing the software by the user in light of its specific status of free software, that may mean + * that it is complicated to manipulate, and that also therefore means that it is reserved for developers and + * experienced professionals having in-depth computer knowledge. Users are therefore encouraged to load and test the + * software's suitability as regards their requirements in conditions enabling the security of their systems and/or data + * to be ensured and, more generally, to use and operate it in the same conditions as regards security. + *

+ * The fact that you are presently reading this means that you have had knowledge of the CeCILL 2.1 license and that you + * accept its terms. + */ +package fr.gouv.vitam.tools.sedalib.utils.digest; + +import fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger; +import org.apache.commons.io.FileUtils; + +import java.nio.file.Path; + +import static fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger.GLOBAL; +import static fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger.doProgressLogWithoutInterruption; + +/** + * Throttled progress logging for digest computation, active only with a non-null logger and a file over 20 MB. + */ +public class DigestProgressLogger { + + private static final long LOGGING_THRESHOLD_BYTES = 20L * FileUtils.ONE_MB; + private static final long LOG_INTERVAL_MS = 3_000; + + private final SEDALibProgressLogger logger; + private final String filename; + private final long fileSize; + private final boolean enabled; + + private long lastLogTimestamp; + + public DigestProgressLogger( + SEDALibProgressLogger logger, + Path path, + long fileSize + ) { + this.logger = logger; + this.filename = extractFilename(path); + this.fileSize = fileSize; + this.enabled = logger != null && fileSize > LOGGING_THRESHOLD_BYTES; + this.lastLogTimestamp = System.currentTimeMillis(); + + if (enabled) { + log(0); + } + } + + /** + * Logs progress for the given running total of bytes read, at most once every 3 seconds.
+ */ + public void logProgress(long bytesReadTotal) { + if (!enabled || !shouldLogNow()) { + return; + } + + log(bytesReadTotal); + } + + public void logEnd() { + if (!enabled) { + return; + } + + log(fileSize); + } + + private boolean shouldLogNow() { + long now = System.currentTimeMillis(); + if (now - lastLogTimestamp >= LOG_INTERVAL_MS) { + lastLogTimestamp = now; + return true; + } + return false; + } + + private void log(long bytesRead) { + doProgressLogWithoutInterruption( + logger, + GLOBAL, + formatMessage(bytesRead), + null + ); + } + + private String formatMessage(long bytesRead) { + int percent = computePercent(bytesRead); + + return String.format( + "digest %s %s/%s (%d%%)", + filename, + FileUtils.byteCountToDisplaySize(bytesRead), + FileUtils.byteCountToDisplaySize(fileSize), + percent + ); + } + + private int computePercent(long bytesReadTotal) { + if (fileSize <= 0) { + return 100; + } + long percent = (bytesReadTotal * 100L) / fileSize; + return (int) Math.min(percent, 100); + } + + private static String extractFilename(Path path) { + return (path != null && path.getFileName() != null) + ? path.getFileName().toString() + : "inconnu"; + } +} diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512.java new file mode 100644 index 00000000..774a0dda --- /dev/null +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512.java @@ -0,0 +1,92 @@ +/** + * Copyright French Prime minister Office/DINSIC/Vitam Program (2015-2019) + *

+ * contact.vitam@programmevitam.fr + *

+ * This software is developed as a validation helper tool, for constructing Submission Information Packages (archives + * sets) in the Vitam program whose purpose is to implement a digital archiving back-office system managing high + * volumetry securely and efficiently. + *

+ * This software is governed by the CeCILL 2.1 license under French law and abiding by the rules of distribution of free + * software. You can use, modify and/ or redistribute the software under the terms of the CeCILL 2.1 license as + * circulated by CEA, CNRS and INRIA at the following URL "http://www.cecill.info". + *

+ * As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, + * users are provided only with a limited warranty and the software's author, the holder of the economic rights, and the + * successive licensors have only limited liability. + *

+ * In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or + * developing or reproducing the software by the user in light of its specific status of free software, that may mean + * that it is complicated to manipulate, and that also therefore means that it is reserved for developers and + * experienced professionals having in-depth computer knowledge. Users are therefore encouraged to load and test the + * software's suitability as regards their requirements in conditions enabling the security of their systems and/or data + * to be ensured and, more generally, to use and operate it in the same conditions as regards security. + *

+ * The fact that you are presently reading this means that you have had knowledge of the CeCILL 2.1 license and that you + * accept its terms. + */ +package fr.gouv.vitam.tools.sedalib.utils.digest; + +import fr.gouv.vitam.tools.sedalib.utils.SEDALibException; +import fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger; + +import java.nio.file.Path; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +/** + * Utility class computing the SHA-512 digest of a file as a lowercase hexadecimal string. + */ +public class DigestSha512 { + + private static final String SHA512_ALGORITHM = "SHA-512"; + + // Private constructor to hide the implicit public one + private DigestSha512() { + } + + /** + * Computes the SHA-512 digest of a file. + * The file reading is fully delegated to NioDigestComputer, whatever the + * file size. + * + * @param path the file path + * @param logger the logger (can be null) + * @return the hex string of the digest + * @throws SEDALibException if an error occurs + */ + public static String compute(Path path, SEDALibProgressLogger logger) throws SEDALibException { + try { + MessageDigest digest = MessageDigest.getInstance(SHA512_ALGORITHM); + byte[] hash = compute(digest, path, logger); + return bytesToHex(hash); + } catch (NoSuchAlgorithmException e) { + throw new SEDALibException("Impossible de mobiliser l'algorithme de hashage " + SHA512_ALGORITHM, e); + } + } + + /** + * Computes the SHA-512 digest of a file, without progress logging.
+ * + * @param path the file path + * @return the hex string of the digest + * @throws SEDALibException if an error occurs + */ + public static String compute(Path path) throws SEDALibException { + return compute(path, null); + } + + private static byte[] compute(MessageDigest digest, Path path, SEDALibProgressLogger logger) + throws SEDALibException { + NioDigestComputer computer = new NioDigestComputer(); + return computer.compute(digest, path, logger); + } + + private static String bytesToHex(byte[] bytes) { + StringBuilder sb = new StringBuilder(bytes.length * 2); + for (byte b : bytes) { + sb.append(String.format("%02x", b)); + } + return sb.toString(); + } +} diff --git a/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/NioDigestComputer.java b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/NioDigestComputer.java new file mode 100644 index 00000000..f824b5a1 --- /dev/null +++ b/sedalib/src/main/java/fr/gouv/vitam/tools/sedalib/utils/digest/NioDigestComputer.java @@ -0,0 +1,79 @@ +/** + * Copyright French Prime minister Office/DINSIC/Vitam Program (2015-2019) + *

+ * contact.vitam@programmevitam.fr + *

+ * This software is developed as a validation helper tool, for constructing Submission Information Packages (archives + * sets) in the Vitam program whose purpose is to implement a digital archiving back-office system managing high + * volumetry securely and efficiently. + *

+ * This software is governed by the CeCILL 2.1 license under French law and abiding by the rules of distribution of free + * software. You can use, modify and/ or redistribute the software under the terms of the CeCILL 2.1 license as + * circulated by CEA, CNRS and INRIA at the following URL "http://www.cecill.info". + *

+ * As a counterpart to the access to the source code and rights to copy, modify and redistribute granted by the license, + * users are provided only with a limited warranty and the software's author, the holder of the economic rights, and the + * successive licensors have only limited liability. + *

+ * In this respect, the user's attention is drawn to the risks associated with loading, using, modifying and/or + * developing or reproducing the software by the user in light of its specific status of free software, that may mean + * that it is complicated to manipulate, and that also therefore means that it is reserved for developers and + * experienced professionals having in-depth computer knowledge. Users are therefore encouraged to load and test the + * software's suitability as regards their requirements in conditions enabling the security of their systems and/or data + * to be ensured and, more generally, to use and operate it in the same conditions as regards security. + *

+ * The fact that you are presently reading this means that you have had knowledge of the CeCILL 2.1 license and that you + * accept its terms. + */ +package fr.gouv.vitam.tools.sedalib.utils.digest; + +import fr.gouv.vitam.tools.sedalib.utils.SEDALibException; +import fr.gouv.vitam.tools.sedalib.utils.SEDALibProgressLogger; +import org.apache.commons.io.FileUtils; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.MessageDigest; + +public class NioDigestComputer { + + public byte[] compute(MessageDigest digest, Path path, SEDALibProgressLogger logger) throws SEDALibException { + long fileSize; + try { + fileSize = Files.size(path); + } catch (IOException e) { + throw new SEDALibException(String.format("Impossible d'accéder au fichier [%s]", path), e); + } + + final int CHUNK_BUFFER_SIZE = Math.toIntExact(64 * FileUtils.ONE_KB); + + try (FileChannel channel = FileChannel.open(path)) { + + DigestProgressLogger progressLogger = new DigestProgressLogger(logger, path, fileSize); + long bytesReadTotal = 0; + + int bufferSize = (int) Math.min(fileSize > 0 ? 
fileSize : CHUNK_BUFFER_SIZE, CHUNK_BUFFER_SIZE); + ByteBuffer buffer = ByteBuffer.allocateDirect(bufferSize); + + while (channel.read(buffer) != -1) { + buffer.flip(); + digest.update(buffer); + int read = buffer.position(); /* update() consumed the flipped buffer: position now equals the bytes just read */ + bytesReadTotal += read; + buffer.clear(); + + progressLogger.logProgress(bytesReadTotal); + } + + progressLogger.logEnd(); + + return digest.digest(); + } catch (IOException e) { + throw new SEDALibException( + String.format("Impossible de calculer le hash du fichier [%s]", path), e); + } + } +} diff --git a/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512Test.java b/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512Test.java new file mode 100644 index 00000000..1c9456bd --- /dev/null +++ b/sedalib/src/test/java/fr/gouv/vitam/tools/sedalib/utils/digest/DigestSha512Test.java @@ -0,0 +1,119 @@ +package fr.gouv.vitam.tools.sedalib.utils.digest; + +import fr.gouv.vitam.tools.sedalib.utils.SEDALibException; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +class DigestSha512Test { + + @TempDir + Path tempDir; + + @Test + void testComputeDigestSmallFile() throws IOException, SEDALibException, NoSuchAlgorithmException { + // Given + Path file = tempDir.resolve("small.txt"); + String content = "Hello World"; + Files.writeString(file, content); + + // When + String digest = DigestSha512.compute(file); + + // Then + MessageDigest md = MessageDigest.getInstance("SHA-512"); + byte[] expectedBytes = md.digest(content.getBytes()); + String expected = bytesToHex(expectedBytes); + + assertThat(digest).isEqualTo(expected); + } + + @Test + void testComputeDigestEmptyFile() throws IOException, SEDALibException, 
NoSuchAlgorithmException { + // Given + Path file = tempDir.resolve("empty.txt"); + Files.createFile(file); + + // When + String digest = DigestSha512.compute(file); + + // Then + MessageDigest md = MessageDigest.getInstance("SHA-512"); + byte[] expectedBytes = md.digest(new byte[0]); + String expected = bytesToHex(expectedBytes); + + assertThat(digest).isEqualTo(expected); + } + + @Test + void testComputeDigestLargeFile() throws IOException, SEDALibException, NoSuchAlgorithmException { + // Given: a file larger than 20MB, hashed through many successive 64KB chunk reads + // 21 MB + int size = 21 * 1024 * 1024; + Path file = tempDir.resolve("large.bin"); + + // Generate random content + byte[] data = new byte[size]; + new Random().nextBytes(data); + Files.write(file, data); + + // When + String digest = DigestSha512.compute(file); + + // Then + MessageDigest md = MessageDigest.getInstance("SHA-512"); + byte[] expectedBytes = md.digest(data); + String expected = bytesToHex(expectedBytes); + + assertThat(digest).isEqualTo(expected); + } + + @Test + void testComputeDigestFileLargerThan2GB() throws IOException, SEDALibException, NoSuchAlgorithmException { + // Given: Create a sparse file larger than 2GB (e.g., 2.5 GB) + // 2.5 GB = 2684354560 bytes + long size = 2684354560L; + Path file = tempDir.resolve("huge_sparse.bin"); + + try (java.io.RandomAccessFile raf = new java.io.RandomAccessFile(file.toFile(), "rw")) { + raf.setLength(size); + } + + // When + String digest = DigestSha512.compute(file); + + // Then + // We know a sparse file reads as zeros. + // To verify, we can either hardcode the hash of 2.5GB of zeros or compute it. + // Computing it in the test ensures correctness but takes time. + // 2.5GB read ~ 5-10s at 500MB/s. 
+ + MessageDigest md = MessageDigest.getInstance("SHA-512"); + byte[] buffer = new byte[64 * 1024]; // 64KB of zeros + long remaining = size; + while (remaining > 0) { + int toRead = (int) Math.min(buffer.length, remaining); + md.update(buffer, 0, toRead); + remaining -= toRead; + } + String expected = bytesToHex(md.digest()); + + assertThat(digest).isEqualTo(expected); + } + + private String bytesToHex(byte[] bytes) { + StringBuilder sb = new StringBuilder(bytes.length * 2); + for (byte b : bytes) { + sb.append(String.format("%02x", b)); + } + return sb.toString(); + } +}