diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 56c1252..83b44b9 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -1,7 +1,7 @@ [versions] guava_version = "33.5.0-jre" -junit_version = "6.0.1" -log4j_version = "2.25.2" +junit_version = "6.0.2" +log4j_version = "2.25.3" [libraries] guava = { module = "com.google.guava:guava", version.ref = "guava_version" } diff --git a/src/main/java/com/wildermods/masshash/Blob.java b/src/main/java/com/wildermods/masshash/Blob.java index bd832cf..07bfa89 100644 --- a/src/main/java/com/wildermods/masshash/Blob.java +++ b/src/main/java/com/wildermods/masshash/Blob.java @@ -4,6 +4,7 @@ import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.Objects; import com.wildermods.masshash.exception.IntegrityException; import com.wildermods.masshash.utils.ByteUtil; @@ -15,6 +16,11 @@ */ public record Blob(byte[] data, String hash) implements IBlob { + public Blob { + Objects.requireNonNull(data); + Objects.requireNonNull(hash); + } + /** * Constructs a Blob from the given data and computes its hash. * @@ -118,23 +124,7 @@ public Blob(InputStream stream, Hash hash) throws IOException, IntegrityExceptio * @return A new {@link Hash} object that represents this blob, but with no associated data. */ public Hash dropData() { - if(isTransient()) { - throw new UnsupportedOperationException("Data already dropped!"); - } - return new Blob((byte[])null, hash); - } - - /** - * Returns the data associated with this Blob - * - * @return a byte array that contains the data stored in this blob - */ - @Override - public byte[] data() { - if(data == null) { - throw new UnsupportedOperationException("Null data! Was the data dropped?"); - } - return data; + return Hash.of(hash()); } @Override @@ -143,7 +133,7 @@ public int hashCode() { } /** - * Compares this object with another Hash object for equality. All {@link Blob} objects are also instances of {@link Hash}. + * Compares this object with another Hash object for equality. All {@link IBlob} objects are also instances of {@link Hash}. *

* Two {@link Hash} objects are considered equal if their hashes are the same. This method specifically compares * the hash of the other object with the hash of this object. If the other object is not an instance of {@link Hash}, diff --git a/src/main/java/com/wildermods/masshash/Data.java b/src/main/java/com/wildermods/masshash/Data.java index 2a7b974..dcbdad3 100644 --- a/src/main/java/com/wildermods/masshash/Data.java +++ b/src/main/java/com/wildermods/masshash/Data.java @@ -1,5 +1,8 @@ package com.wildermods.masshash; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; import java.util.Arrays; /** @@ -15,6 +18,21 @@ public interface Data { */ public byte[] data(); + /** + * Returns an {@link InputStream} for reading the data. + * + * Default implementation wraps {@link data()} in a {@link ByteArrayInputStream} + * Classes that can stream data without holding it in memory should override this. + * + * @return an {@link InputStream} for the data + */ + public default InputStream dataStream() throws IOException { + if(isTransient()) { + throw new IllegalStateException("No data to stream!"); + } + return new ByteArrayInputStream(data()); + } + /** * Checks if the data is transient, meaning the data is null or otherwise unavailable. * diff --git a/src/main/java/com/wildermods/masshash/Hash.java b/src/main/java/com/wildermods/masshash/Hash.java index 6bb4893..866372d 100644 --- a/src/main/java/com/wildermods/masshash/Hash.java +++ b/src/main/java/com/wildermods/masshash/Hash.java @@ -41,6 +41,51 @@ public default boolean hashEquals(String hash) { * @return a new {@link Hash} instance. */ public static Hash of(String hash) { - return new Blob((byte[])null, hash); + return new Hash() { + + @Override + public String hash() { + return hash; + } + + @Override + public int hashCode() { + return hash.hashCode(); + } + + /** + * Compares this object with another Hash object for equality. All {@link IBlob} objects are also instances of {@link Hash}. + *

+ * Two {@link Hash} objects are considered equal if their hashes are the same. This method specifically compares + * the hash of the other object with the hash of this object. If the other object is not an instance of {@link Hash}, + * the method returns {@code false}. + *

+ * + * @param o the object to compare with this Hash object. + * @return {@code true} if the other object is a {@link Hash} and has the same hash; {@code false} otherwise. + */ + @Override + public boolean equals(Object o) { + if(o instanceof Hash) { + return hash().equals(((Hash) o).hash()); + } + return false; + } + + /** + * Returns a string representation of this Blob, which is its hash value. + *

+ * This method overrides the default {@link Object#toString()} method to provide a more meaningful + * string representation of the Blob. + *

+ * + * @return the hash of the Blob as a string. + */ + @Override + public String toString() { + return hash(); + } + + }; } } \ No newline at end of file diff --git a/src/main/java/com/wildermods/masshash/Hasher.java b/src/main/java/com/wildermods/masshash/Hasher.java index 5913f0f..19be387 100644 --- a/src/main/java/com/wildermods/masshash/Hasher.java +++ b/src/main/java/com/wildermods/masshash/Hasher.java @@ -93,7 +93,7 @@ public Hasher(final Stream files) throws IOException { * before being added to the result map. The updated reference value will be associated with the computed hash. * @throws IOException if an I/O error occurs during hashing */ - public Hasher(final Stream files, final BiConsumer, Blob> forEachBlob) throws IOException { + public Hasher(final Stream files, final BiConsumer, IBlob> forEachBlob) throws IOException { this(files, (p) -> true, forEachBlob); } @@ -112,7 +112,7 @@ public Hasher(final Stream files, final BiConsumer, Blob> * @throws IOException if an I/O error occurs during hashing * @throws IllegalArgumentException if no files match the predicate */ - public Hasher(final Stream files, final Predicate predicate, final BiConsumer, Blob> forEachBlob) throws IOException { + public Hasher(final Stream files, final Predicate predicate, final BiConsumer, IBlob> forEachBlob) throws IOException { this(files, Runtime.getRuntime().availableProcessors(), predicate, forEachBlob); } @@ -147,7 +147,7 @@ public Hasher(final Stream files, final Predicate predicate, final B * @throws IOException if an error occurs while reading files or during thread execution * @throws IllegalArgumentException if no files matched the provided predicate */ - public Hasher(final Stream files, int threads, final Predicate predicate, final BiConsumer,Blob> forEachBlob) throws IOException { + public Hasher(final Stream files, int threads, final Predicate predicate, final BiConsumer,IBlob> forEachBlob) throws IOException { final int processors = Runtime.getRuntime().availableProcessors(); Objects.requireNonNull(files); Objects.requireNonNull(predicate); @@ -214,12 +214,12 @@ public Hasher(final Stream files, int threads, final Predicate predi for (Path file : sublist) { Reference newFile = new Reference<>(file); //Read and hash the file into a Blob, then discard the Blob’s data to conserve memory - Hash blob = new Blob(file); - forEachBlob.accept(newFile, (Blob) blob); - ((Blob) blob).dropData(); + IBlob blob = LightBlob.from(file); + forEachBlob.accept(newFile, (IBlob) blob); + Hash hash = blob.dropData(); //Group files by their content hash. Files with the same hash will share the same key - local.computeIfAbsent(blob, k -> new HashSet<>()).add(newFile.get()); + local.computeIfAbsent(hash, k -> new HashSet<>()).add(newFile.get()); } return local; })); diff --git a/src/main/java/com/wildermods/masshash/IBlob.java b/src/main/java/com/wildermods/masshash/IBlob.java index 7fa11ef..6f9c859 100644 --- a/src/main/java/com/wildermods/masshash/IBlob.java +++ b/src/main/java/com/wildermods/masshash/IBlob.java @@ -17,4 +17,13 @@ public interface IBlob extends Data, Hash { * indicating data corruption or alteration. */ public void verify() throws IntegrityException; + + /** + * Drops the data from the current object and returns a new Hash that represents the hash of this Blob. + * The original blob still holds the data for as long as you keep it referenced. + * + * @return A new {@link Hash} object that represents this blob, but with no associated data. + */ + public Hash dropData(); + } diff --git a/src/main/java/com/wildermods/masshash/LightBlob.java b/src/main/java/com/wildermods/masshash/LightBlob.java new file mode 100644 index 0000000..461419e --- /dev/null +++ b/src/main/java/com/wildermods/masshash/LightBlob.java @@ -0,0 +1,217 @@ +package com.wildermods.masshash; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UncheckedIOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Objects; +import java.util.function.Supplier; + +import com.wildermods.masshash.exception.IntegrityException; +import com.wildermods.masshash.utils.ByteUtil; + +/** + * A lightweight implementation of {@link IBlob} that represents data which can be read + * as a stream. Does not necessarily store the full byte array in memory. + *

+ * This is particularly useful for large files or streams (e.g., files on disk, network streams), + * where reading the entire content into memory is undesirable. The hash of the data is always stored + * and can be verified without retaining the raw bytes. + *

+ */ +public record LightBlob(Supplier streamSupplier, String hash) implements IBlob { + + /** + * Canonical constructor. Ensures neither the {@code streamSupplier} nor {@code hash} are null. + * + * @param streamSupplier a {@link Supplier} that provides a fresh {@link InputStream} to read the data + * @param hash the SHA-1 hash of the data + * @throws NullPointerException if either {@code streamSupplier} or {@code hash} is null + */ + public LightBlob { + Objects.requireNonNull(streamSupplier); + Objects.requireNonNull(hash); + } + + /** + * Creates a {@link LightBlob} from a file at the specified path, computing the hash from its contents. + * + * @param path the file path + * @return a new {@link LightBlob} representing the file + * @throws IOException if reading the file fails + */ + public static LightBlob from(Path path) throws IOException { + Supplier streamSupplier = () -> { + try { + return Files.newInputStream(path); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }; + + try (InputStream stream = streamSupplier.get()){ + return new LightBlob(streamSupplier, ByteUtil.hash(stream)); + } catch (UncheckedIOException e) { + throw new IOException(e); + } + } + + /** + * Creates a {@link LightBlob} from a file at the specified path and verifies it matches the expected hash. + * + * @param path the file path + * @param expectedHash the expected hash of the file contents + * @return a new {@link LightBlob} representing the file + * @throws IOException if reading the file fails + * @throws IntegrityException if the file's hash does not match {@code expectedHash} + */ + public static LightBlob from(Path path, String expectedHash) throws IOException, IntegrityException { + Supplier streamSupplier = () -> { + try { + return Files.newInputStream(path); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }; + + try { + LightBlob blob = new LightBlob(streamSupplier, expectedHash); + blob.verify(); + return blob; + } catch (UncheckedIOException e) { + throw new IOException(e); + } + } + + /** + * Returns the full byte array of the blob data. + *

+ * Deprecated because reading the entire data into memory may be expensive for large streams. + * Prefer {@link #dataStream()} instead. + *

+ * + * @return the byte array of the blob + * @throws UncheckedIOException if reading the stream fails + */ + @Override + @Deprecated + public byte[] data() { + try (InputStream stream = dataStream()){ + return stream.readAllBytes(); + } + catch(IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Checks if this blob is transient, meaning the underlying stream cannot be opened. + *

+ * This can occur if the file is deleted, the network stream fails, or any other I/O error + * prevents the stream from being accessed. + *

+ * + * @return {@code true} if the data stream cannot be opened, {@code false} otherwise + */ + @Override + public boolean isTransient() { + try (InputStream stream = streamSupplier.get()){ + return false; + } + catch(Exception e) { + return true; + } + } + + /** + * Returns a fresh {@link InputStream} for reading the blob's data. + *

+ * Each call returns a new stream. The caller is responsible for closing it. + *

+ * + * @return a fresh {@link InputStream} for reading the blob's contents + * @throws IOException if the stream cannot be opened + */ + @Override + public InputStream dataStream() throws IOException { + try { + return streamSupplier.get(); + } + catch(Exception e) { + throw new IOException(e); + } + } + + /** + * Verifies that the data matches the provided hash. + *

+ * This method computes the hash of the current data and compares it to the expected hash. If the hashes do not match, + * an {@link IntegrityException} is thrown. This method ensures the integrity of the data. + *

+ * + * @throws IntegrityException if the computed hash of the data does not match the expected hash. + */ + @Override + public void verify() throws IntegrityException { + try (InputStream stream = dataStream()){ + String actualHash = ByteUtil.hash(stream); + if(!actualHash.equals(hash)) { + throw new IntegrityException("Expected hash " + hash + " but got " + actualHash); + } + } + catch(IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Drops the data from the current object and returns a new Hash that represents the hash of this Blob. + * The original blob still holds the data for as long as you keep it referenced. + * + * @return A new {@link Hash} object that represents this blob, but with no associated data. + */ + @Override + public Hash dropData() { + return Hash.of(hash()); + } + + @Override + public int hashCode() { + return hash.hashCode(); + } + + /** + * Compares this object with another Hash object for equality. All {@link IBlob} objects are also instances of {@link Hash}. + *

+ * Two {@link Hash} objects are considered equal if their hashes are the same. This method specifically compares + * the hash of the other object with the hash of this object. If the other object is not an instance of {@link Hash}, + * the method returns {@code false}. + *

+ * + * @param o the object to compare with this Hash object. + * @return {@code true} if the other object is a {@link Hash} and has the same hash; {@code false} otherwise. + */ + @Override + public boolean equals(Object o) { + if(o instanceof Hash) { + return hash().equals(((Hash) o).hash()); + } + return false; + } + + /** + * Returns a string representation of this Blob, which is its hash value. + *

+ * This method overrides the default {@link Object#toString()} method to provide a more meaningful + * string representation of the Blob. + *

+ * + * @return the hash of the Blob as a string. + */ + @Override + public String toString() { + return hash(); + } + +} diff --git a/src/main/java/com/wildermods/masshash/utils/ByteUtil.java b/src/main/java/com/wildermods/masshash/utils/ByteUtil.java index c430c8d..11d765f 100644 --- a/src/main/java/com/wildermods/masshash/utils/ByteUtil.java +++ b/src/main/java/com/wildermods/masshash/utils/ByteUtil.java @@ -1,5 +1,7 @@ package com.wildermods.masshash.utils; +import java.io.IOException; +import java.io.InputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Objects; @@ -29,6 +31,30 @@ public static String hash(byte[] bytes) { } } + /** + * Hashes the contents of an InputStream using SHA-1 without loading all bytes into memory. + * The stream is read sequentially in 1 MiB chunks. + * + * @param stream the InputStream to hash + * @return the hexadecimal SHA-1 hash + * @throws IOException if an I/O error occurs reading the stream + * @throws NullPointerException if the stream is null + */ + public static String hash(InputStream stream) throws IOException { + Objects.requireNonNull(stream, "InputStream cannot be null."); + try { + MessageDigest digest = MessageDigest.getInstance("SHA-1"); + byte[] buffer = new byte[1048576]; // 1 MiB buffer + int bytesRead; + while ((bytesRead = stream.read(buffer)) != -1) { + digest.update(buffer, 0, bytesRead); + } + return bytesToHex(digest.digest()); + } catch (NoSuchAlgorithmException e) { + throw new AssertionError("SHA-1 algorithm is unavailable.", e); + } + } + /** * Converts a byte array into a hexadecimal string representation. *

diff --git a/src/test/java/com/wildermods/masshash/BlobTests.java b/src/test/java/com/wildermods/masshash/BlobTests.java index 0f92860..b560e33 100644 --- a/src/test/java/com/wildermods/masshash/BlobTests.java +++ b/src/test/java/com/wildermods/masshash/BlobTests.java @@ -3,21 +3,41 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrowsExactly; +import java.util.function.Supplier; +import java.io.ByteArrayInputStream; +import java.io.InputStream; + import org.junit.jupiter.api.Test; import com.wildermods.masshash.exception.IntegrityException; +import com.wildermods.masshash.utils.ByteUtil; public class BlobTests { + private static final String testHash = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"; + private static final Blob testBlob = new Blob("test".getBytes()); private static final Blob testBlob2 = new Blob("test".getBytes()); + private static final LightBlob lightBlob = new LightBlob( + (Supplier)() -> { + return new ByteArrayInputStream( + "test".getBytes() + );}, testHash + ); - private static final String testHash = "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3"; + @Test + public void testNullConstructors() { + assertThrowsExactly(NullPointerException.class, () -> new Blob((byte[])null)); + assertThrowsExactly(NullPointerException.class, () -> new Blob(new byte[0], (String)null)); + assertThrowsExactly(NullPointerException.class, () -> new LightBlob(null, "")); + assertThrowsExactly(NullPointerException.class, () -> new LightBlob(lightBlob.streamSupplier(), null)); + } @Test public void testBlob() { assertEquals(testBlob.hash(), testHash); assertEquals(testBlob.toString(), testHash); + assertEquals(lightBlob.toString(), testHash); } @Test @@ -25,6 +45,8 @@ public void testBlobEquality() { assertEquals(testBlob, testBlob); assertEquals(testBlob, testBlob2); assertEquals(testBlob2, testBlob); + assertEquals(lightBlob, testBlob); + assertEquals(testBlob, lightBlob); } @Test @@ -35,25 +57,26 @@ public void testDroppedBlobEquality() { assertEquals(testBlob, testBlob2); assertEquals(testBlob2, testBlob); assertEquals(testBlob2, testBlob2); - } - - @Test - public void testDropData() { - Blob dropped = (Blob) testBlob.dropData(); - assertThrowsExactly(UnsupportedOperationException.class, () -> {dropped.data();}); - assertThrowsExactly(UnsupportedOperationException.class, () -> {dropped.dropData();}); + + Hash lightBlob2 = lightBlob.dropData(); + assertEquals(testBlob, testBlob); + assertEquals(testBlob, lightBlob2); + assertEquals(lightBlob2, testBlob); + assertEquals(lightBlob2, lightBlob2); } @Test public void testVerification() throws IntegrityException { testBlob.verify(); - Blob dropped = (Blob) testBlob.dropData(); - Blob corrupt = new Blob(testBlob.data(), new Blob("corrupt".getBytes()).hash()); + IBlob corrupt = new Blob(testBlob.data(), new Blob("corrupt".getBytes()).hash()); + IBlob corrupt2 = new LightBlob( + lightBlob.streamSupplier(), ByteUtil.hash("corrupt".getBytes()) + ); - assertThrowsExactly(UnsupportedOperationException.class, () -> dropped.verify()); assertThrowsExactly(IntegrityException.class, () -> corrupt.verify()); assertThrowsExactly(IntegrityException.class, () -> new Blob("test".getBytes(), new Blob("corrupt".getBytes()))); + assertThrowsExactly(IntegrityException.class, () -> corrupt2.verify()); } }