From 11b9999bf69e35087e42959645845c7e789e7bdb Mon Sep 17 00:00:00 2001 From: Dat Tran Date: Thu, 10 Oct 2019 16:51:55 +0200 Subject: [PATCH 1/5] Add CLI to find duplicates directly from the terminal. --- README.md | 7 ++++ imagededup/client/__init__.py | 0 imagededup/client/client.py | 35 ++++++++++++++++++ setup.py | 1 + tests/test_client.py | 69 +++++++++++++++++++++++++++++++++++ 5 files changed, 112 insertions(+) create mode 100644 imagededup/client/__init__.py create mode 100644 imagededup/client/client.py create mode 100644 tests/test_client.py diff --git a/README.md b/README.md index 50b57d51..63319394 100644 --- a/README.md +++ b/README.md @@ -116,6 +116,13 @@ plot_duplicates(image_dir='path/to/image/directory', duplicate_map=duplicates, filename='ukbench00120.jpg') ``` + +Alternatively, you can also use our library in the CLI: +```bash +imagededup find-duplicates --image_dir path/to/image/directory --method PHash +``` +For more information on the available options use the `--help` flag. + For more examples, refer [this](https://github.com/idealo/imagededup/tree/master/examples) part of the repository. 
diff --git a/imagededup/client/__init__.py b/imagededup/client/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/imagededup/client/client.py b/imagededup/client/client.py new file mode 100644 index 00000000..41b23b17 --- /dev/null +++ b/imagededup/client/client.py @@ -0,0 +1,35 @@ +import click + +from typing import Optional +from pathlib import PosixPath + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.option('--image_dir', help='Path to the directory containing all the images.', required=True, type=str) +@click.option('--method', help='Select which algorithm to use.', required=True, + type=click.Choice(['PHash', 'DHash', 'WHash', 'AHash', 'CNN'])) +@click.option('--outfile', help='Name of the file the results should be written to.', type=str) +@click.option('--max_distance_threshold', default=10, + help='Hamming distance between two images below which retrieved duplicates are valid.', type=int) +@click.option('--scores', + help='Boolean indicating whether Hamming distances are to be returned along with retrieved duplicates.', + type=bool) +def find_duplicates(image_dir: PosixPath, + method: str, + outfile: Optional[str], + max_distance_threshold: int, + scores: bool) -> None: + import imagededup.methods + selected_method = eval('imagededup.methods.{}()'.format(method)) + encodings = selected_method.encode_images(image_dir) + duplicates = selected_method.find_duplicates(encoding_map=encodings, + outfile=outfile, + max_distance_threshold=max_distance_threshold, + scores=scores) + if outfile is None: + click.echo(duplicates) diff --git a/setup.py b/setup.py index 93c480e3..e91a9d01 100644 --- a/setup.py +++ b/setup.py @@ -55,4 +55,5 @@ 'Topic :: Software Development :: Libraries :: Python Modules', ], packages=find_packages(exclude=('tests',)), + entry_points={'console_scripts': ['imagededup=imagededup.client.client:cli']}, ) diff --git a/tests/test_client.py b/tests/test_client.py new file mode 100644 index 00000000..78b19a35 
--- /dev/null +++ b/tests/test_client.py @@ -0,0 +1,69 @@ +import os + +from click.testing import CliRunner +from imagededup.client.client import find_duplicates + +PATH_IMAGE_DIR = 'tests/data/mixed_images' +FILENAME = 'tests/test_output.json' + + +def test_no_image_dir_given(): + runner = CliRunner() + result = runner.invoke(find_duplicates, ['--method', 'PHash']) + assert result.exit_code == 2 + + +def test_image_dir_given_but_no_method(): + runner = CliRunner() + result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR]) + assert result.exit_code == 2 + + +def test_image_dir_given_and_method(): + runner = CliRunner() + result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash']) + assert result.exit_code == 0 + + +def test_image_dir_given_but_wrong_method(): + runner = CliRunner() + result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR, '--method', 'LHash']) + assert result.exit_code == 2 + + +def test_file_is_created(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--outfile', FILENAME]) + assert result.exit_code == 0 + assert os.path.isfile(FILENAME) is True + # cleanup + os.remove(FILENAME) + + +def test_max_distance_threshold_int(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '20']) + assert result.exit_code == 0 + + +def test_max_distance_threshold_no_int(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '0.5']) + assert result.exit_code == 2 + + +def test_scores_boolean(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--scores', 'False']) + assert result.exit_code == 0 + + +def test_scores_no_boolean(): + runner = CliRunner() + result =
runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--scores', 'hello']) + assert result.exit_code == 2 From 3bbb62f3f9f18086b6573db160d9ad85eb7bac1e Mon Sep 17 00:00:00 2001 From: Dat Tran Date: Fri, 11 Oct 2019 09:47:34 +0200 Subject: [PATCH 2/5] Add Click to setup.py. --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 114c14a6..18b846c8 100644 --- a/setup.py +++ b/setup.py @@ -36,7 +36,8 @@ 'tensorflow~=2.0.0', 'tqdm', 'scikit-learn', - 'matplotlib' + 'matplotlib', + 'Click' ], extras_require={ 'tests': ['pytest', 'pytest-cov', 'pytest-mock', 'codecov'], From 15ab00415b3a01ad4896bdfa2df45339418a7795 Mon Sep 17 00:00:00 2001 From: Dat Tran Date: Fri, 11 Oct 2019 10:18:25 +0200 Subject: [PATCH 3/5] Small rewording in readme. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ed30eaf3..52de81e5 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Alternatively, you can also use our library in the CLI: ```bash imagededup find-duplicates --image_dir path/to/image/directory --method PHash ``` -For more information on the available options use the `--help` flag. +Use the `--help` flag to get more information on the available options. For more examples, refer [this](https://github.com/idealo/imagededup/tree/master/examples) part of the repository. From 65105aa3b120b5d8231cf8cc6b0a059756845688 Mon Sep 17 00:00:00 2001 From: Dat Tran Date: Fri, 11 Oct 2019 11:59:09 +0200 Subject: [PATCH 4/5] Add min_similarity_threshold as option in the CLI for CNN and restrict ranges for both min_similarity_threshold and max_distance_threshold. 
--- imagededup/client/client.py | 41 +++++++++++------ tests/test_client.py | 88 ++++++++++++++++++++++++++++++++++++- 2 files changed, 113 insertions(+), 16 deletions(-) diff --git a/imagededup/client/client.py b/imagededup/client/client.py index 41b23b17..867a8d3c 100644 --- a/imagededup/client/client.py +++ b/imagededup/client/client.py @@ -10,26 +10,39 @@ def cli(): @cli.command() -@click.option('--image_dir', help='Path to the directory containing all the images.', required=True, type=str) -@click.option('--method', help='Select which algorithm to use.', required=True, - type=click.Choice(['PHash', 'DHash', 'WHash', 'AHash', 'CNN'])) +@click.option('--image_dir', help='Path to the directory containing all the images.', type=str, required=True) +@click.option('--method', help='Select which algorithm to use.', + type=click.Choice(['PHash', 'DHash', 'WHash', 'AHash', 'CNN']), required=True) @click.option('--outfile', help='Name of the file the results should be written to.', type=str) -@click.option('--max_distance_threshold', default=10, - help='Hamming distance between two images below which retrieved duplicates are valid.', type=int) +@click.option('--min_similarity_threshold', + help='For CNN only: threshold value (must be float between -1.0 and 1.0). Default is 0.9.', + type=click.FloatRange(-1.0, 1.0), + default=0.9) +@click.option('--max_distance_threshold', + help='For hashing methods only: threshold value (must be integer between 0 and 64). 
Default is 10.', + type=click.IntRange(0, 64), default=10) @click.option('--scores', - help='Boolean indicating whether Hamming distances are to be returned along with retrieved duplicates.', + help='Boolean indicating whether scores are to be returned along with retrieved duplicates.', type=bool) def find_duplicates(image_dir: PosixPath, - method: str, - outfile: Optional[str], - max_distance_threshold: int, - scores: bool) -> None: + method: str, + outfile: Optional[str], + min_similarity_threshold: float, + max_distance_threshold: int, + scores: bool) -> None: import imagededup.methods selected_method = eval('imagededup.methods.{}()'.format(method)) encodings = selected_method.encode_images(image_dir) - duplicates = selected_method.find_duplicates(encoding_map=encodings, - outfile=outfile, - max_distance_threshold=max_distance_threshold, - scores=scores) + + if method == 'CNN': + duplicates = selected_method.find_duplicates(encoding_map=encodings, + outfile=outfile, + min_similarity_threshold=min_similarity_threshold, + scores=scores) + else: + duplicates = selected_method.find_duplicates(encoding_map=encodings, + outfile=outfile, + max_distance_threshold=max_distance_threshold, + scores=scores) if outfile is None: click.echo(duplicates) diff --git a/tests/test_client.py b/tests/test_client.py index 78b19a35..080846f8 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -41,20 +41,104 @@ def test_file_is_created(): os.remove(FILENAME) -def test_max_distance_threshold_int(): +def test_hash_max_distance_threshold_int(): runner = CliRunner() result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '20']) assert result.exit_code == 0 -def test_max_distance_threshold_no_int(): +def test_hash_max_distance_threshold_no_int(): runner = CliRunner() result = runner.invoke(find_duplicates, ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '0.5']) assert 
result.exit_code == 2 +def test_hash_max_distance_threshold_in_range_left_interval(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '0']) + assert result.exit_code == 0 + + +def test_hash_max_distance_threshold_in_range_right_interval(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '64']) + assert result.exit_code == 0 + + +def test_hash_max_distance_threshold_out_of_range_negative(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '-30']) + assert result.exit_code == 2 + + +def test_hash_max_distance_threshold_out_of_range_positive(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--max_distance_threshold', '900']) + assert result.exit_code == 2 + + +def test_hash_min_similarity_threshold_has_no_effect(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'PHash', '--min_similarity_threshold', '0.5']) + assert result.exit_code == 0 + + +def test_cnn_min_similarity_threshold_float(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '0.5']) + assert result.exit_code == 0 + + +def test_cnn_min_similarity_threshold_no_float(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', 'abc']) + assert result.exit_code == 2 + + +def test_cnn_min_similarity_threshold_in_range_left_interval(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '-1.0']) + assert
result.exit_code == 0 + + +def test_cnn_min_similarity_threshold_in_range_right_interval(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '1.0']) + assert result.exit_code == 0 + + +def test_cnn_min_similarity_threshold_out_of_range_negative(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '-1.5']) + assert result.exit_code == 2 + + +def test_cnn_min_similarity_threshold_out_of_range_positive(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--min_similarity_threshold', '1.5']) + assert result.exit_code == 2 + + +def test_cnn_max_distance_threshold_has_no_effect(): + runner = CliRunner() + result = runner.invoke(find_duplicates, + ['--image_dir', PATH_IMAGE_DIR, '--method', 'CNN', '--max_distance_threshold', '10']) + assert result.exit_code == 0 + + def test_scores_boolean(): runner = CliRunner() result = runner.invoke(find_duplicates, From 5c9cdd09ca776bbce69f7ffc4688689edd8596a3 Mon Sep 17 00:00:00 2001 From: Dat Tran Date: Fri, 11 Oct 2019 21:49:25 +0200 Subject: [PATCH 5/5] Replace eval as this is unsafe.
--- imagededup/client/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imagededup/client/client.py b/imagededup/client/client.py index 867a8d3c..ff5f74ae 100644 --- a/imagededup/client/client.py +++ b/imagededup/client/client.py @@ -1,3 +1,4 @@ +import importlib import click from typing import Optional @@ -30,8 +31,7 @@ def find_duplicates(image_dir: PosixPath, min_similarity_threshold: float, max_distance_threshold: int, scores: bool) -> None: - import imagededup.methods - selected_method = eval('imagededup.methods.{}()'.format(method)) + selected_method = getattr(importlib.import_module('imagededup.methods'), method)() encodings = selected_method.encode_images(image_dir) if method == 'CNN':