diff --git a/README.md b/README.md
index 21857ec2..52de81e5 100644
--- a/README.md
+++ b/README.md
@@ -116,6 +116,13 @@ plot_duplicates(image_dir='path/to/image/directory',
                 duplicate_map=duplicates,
                 filename='ukbench00120.jpg')
 ```
+
+Alternatively, you can use our library from the command line:
+```bash
+imagededup find-duplicates --image_dir path/to/image/directory --method PHash
+```
+Use the `--help` flag to get more information on the available options.
+
 For more examples, refer [this](https://github.com/idealo/imagededup/tree/master/examples) part of the repository.


diff --git a/imagededup/client/__init__.py b/imagededup/client/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/imagededup/client/client.py b/imagededup/client/client.py
new file mode 100644
index 00000000..ff5f74ae
--- /dev/null
+++ b/imagededup/client/client.py
@@ -0,0 +1,58 @@
+import importlib
+import click
+
+from typing import Optional
+
+
+@click.group()
+def cli():
+    """Command line interface for imagededup."""
+
+
+@cli.command()
+@click.option('--image_dir', help='Path to the directory containing all the images.',
+              type=click.Path(exists=True, file_okay=False), required=True)
+@click.option('--method', help='Select which algorithm to use.',
+              type=click.Choice(['PHash', 'DHash', 'WHash', 'AHash', 'CNN']), required=True)
+@click.option('--outfile', help='Name of the file the results should be written to.', type=str)
+@click.option('--min_similarity_threshold',
+              help='For CNN only: threshold value (must be float between -1.0 and 1.0). Default is 0.9.',
+              type=click.FloatRange(-1.0, 1.0),
+              default=0.9)
+@click.option('--max_distance_threshold',
+              help='For hashing methods only: threshold value (must be integer between 0 and 64). Default is 10.',
+              type=click.IntRange(0, 64), default=10)
+@click.option('--scores',
+              help='Boolean indicating whether scores are to be returned along with retrieved duplicates.',
+              type=bool, default=False)
+def find_duplicates(image_dir: str,
+                    method: str,
+                    outfile: Optional[str],
+                    min_similarity_threshold: float,
+                    max_distance_threshold: int,
+                    scores: bool) -> None:
+    """Find duplicate images in a directory and print or save the result.
+
+    The images are encoded with the selected method and compared with each
+    other. The duplicates are written to --outfile if it is given; otherwise
+    they are printed to stdout.
+    """
+    # --method is validated by click.Choice above, so only the listed class
+    # names can be looked up on imagededup.methods here.
+    selected_method = getattr(importlib.import_module('imagededup.methods'), method)()
+    encodings = selected_method.encode_images(image_dir)
+
+    # Each family of methods is given only the threshold option that the
+    # CLI help documents for it.
+    if method == 'CNN':
+        duplicates = selected_method.find_duplicates(encoding_map=encodings,
+                                                     outfile=outfile,
+                                                     min_similarity_threshold=min_similarity_threshold,
+                                                     scores=scores)
+    else:
+        duplicates = selected_method.find_duplicates(encoding_map=encodings,
+                                                     outfile=outfile,
+                                                     max_distance_threshold=max_distance_threshold,
+                                                     scores=scores)
+    if outfile is None:
+        click.echo(duplicates)
diff --git a/setup.py b/setup.py
index 4d7a2bb2..18b846c8 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,8 @@
         'tensorflow~=2.0.0',
         'tqdm',
         'scikit-learn',
-        'matplotlib'
+        'matplotlib',
+        'Click'
     ],
     extras_require={
         'tests': ['pytest', 'pytest-cov', 'pytest-mock', 'codecov'],
@@ -55,4 +56,5 @@
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
     packages=find_packages(exclude=('tests',)),
+    entry_points={'console_scripts': ['imagededup=imagededup.client.client:cli']},
 )
diff --git a/tests/test_client.py b/tests/test_client.py
new file mode 100644
index 00000000..080846f8
--- /dev/null
+++ b/tests/test_client.py
@@ -0,0 +1,163 @@
+import os
+
+from click.testing import CliRunner
+from imagededup.client.client import find_duplicates
+
+PATH_IMAGE_DIR = 'tests/data/mixed_images'
+FILENAME = 'tests/test_output.json'
+
+
+def test_no_image_dir_given():
+    # --method is supplied so that the missing --image_dir is the only usage error.
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates, ['--method', 'PHash'])
+    assert result.exit_code == 2
+
+
+def test_image_dir_does_not_exist():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', 'tests/data/does_not_exist', '--method', 'PHash'])
+    assert result.exit_code == 2
+
+
+def test_image_dir_given_but_no_method():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR])
+    assert result.exit_code == 2
+
+
+def test_image_dir_given_and_method():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash'])
+    assert result.exit_code == 0
+
+
+def test_image_dir_given_but_wrong_method():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR, '--method', 'LHash'])
+    assert result.exit_code == 2
+
+
+def test_file_is_created():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--outfile', FILENAME])
+    assert result.exit_code == 0
+    assert os.path.isfile(FILENAME) is True
+    # cleanup
+    os.remove(FILENAME)
+
+
+def test_hash_max_distance_threshold_int():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '20'])
+    assert result.exit_code == 0
+
+
+def test_hash_max_distance_threshold_no_int():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '0.5'])
+    assert result.exit_code == 2
+
+
+def test_hash_max_distance_threshold_in_range_left_interval():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '0'])
+    assert result.exit_code == 0
+
+
+def test_hash_max_distance_threshold_in_range_right_interval():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '64'])
+    assert result.exit_code == 0
+
+
+def test_hash_max_distance_threshold_out_of_range_negative():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '-30'])
+    assert result.exit_code == 2
+
+
+def test_hash_max_distance_threshold_out_of_range_positive():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '900'])
+    assert result.exit_code == 2
+
+
+def test_hash_min_similarity_threshold_has_no_effect():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--min_similarity_threshold', '0.5'])
+    assert result.exit_code == 0
+
+
+def test_cnn_min_similarity_threshold_float():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '0.5'])
+    assert result.exit_code == 0
+
+
+def test_cnn_min_similarity_threshold_no_float():
+    # A non-numeric value is used; '10' would parse as a float and only fail the range check.
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', 'high'])
+    assert result.exit_code == 2
+
+
+def test_cnn_min_similarity_threshold_in_range_left_interval():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '-1.0'])
+    assert result.exit_code == 0
+
+
+def test_cnn_min_similarity_threshold_in_range_right_interval():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '1.0'])
+    assert result.exit_code == 0
+
+
+def test_cnn_min_similarity_threshold_out_of_range_negative():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '-1.5'])
+    assert result.exit_code == 2
+
+
+def test_cnn_min_similarity_threshold_out_of_range_positive():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '1.5'])
+    assert result.exit_code == 2
+
+
+def test_cnn_max_distance_threshold_has_no_effect():
+    # Must run the CNN method: the hashing threshold is expected to be ignored by it.
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--max_distance_threshold', '10'])
+    assert result.exit_code == 0
+
+
+def test_scores_boolean():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--scores', 'False'])
+    assert result.exit_code == 0
+
+
+def test_scores_no_boolean():
+    runner = CliRunner()
+    result = runner.invoke(find_duplicates,
+                           ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--scores', 'hello'])
+    assert result.exit_code == 2