# Patch overview (git unified diff). NOTE(review): the line breaks of this patch
# appear to have been lost -- the whole diff sits on one physical line -- so it
# will need its newlines restored before it can be applied.
#
# 1. New file ml_datasets/test/test_util.py (27 added lines) with two pytest
#    tests for ml_datasets.util.get_file:
#    - test_get_file_domain_resolution_fails expects a URLError whose text
#      contains the requested file name followed by a resolver failure message
#      ("not known" or "getaddrinfo failed") for the host test_notexist.wth.
#    - test_get_file_404_file_not_found expects an HTTPError whose text contains
#      the file name, "404" and "Not Found", asserts e.value.code == 404, and
#      then closes the exception object to avoid a
#      PytestUnraisableExceptionWarning.
#    NOTE(review): both tests pass real URLs to get_file, so they presumably
#    need network/DNS access when run -- confirm this is acceptable in CI.
#
# 2. ml_datasets/util.py, hunk after dl_progress: the download error handlers
#    no longer wrap failures in a generic Exception. HTTPError is now caught
#    before URLError (it is a subclass; in the removed code the URLError
#    handler came first, so the HTTPError handler could not be reached). Each
#    handler rewrites the message attribute (e.msg for HTTPError, e.reason for
#    URLError) to include the origin URL and re-raises the original exception,
#    so callers can catch the specific urllib exception types.
diff --git a/ml_datasets/test/test_util.py b/ml_datasets/test/test_util.py new file mode 100644 index 0000000..d37826b --- /dev/null +++ b/ml_datasets/test/test_util.py @@ -0,0 +1,27 @@ +import pytest +from urllib.error import HTTPError, URLError +from ml_datasets.util import get_file + + +def test_get_file_domain_resolution_fails(): + with pytest.raises( + URLError, match=r"test_non_existent_file.*(not known|getaddrinfo failed)" + ): + get_file( + "non_existent_file.txt", + "http://test_notexist.wth/test_non_existent_file.txt" + ) + + +def test_get_file_404_file_not_found(): + with pytest.raises(HTTPError, match="test_non_existent_file.*404.*Not Found") as e: + get_file( + "non_existent_file.txt", + "http://google.com/test_non_existent_file.txt" + ) + assert e.value.code == 404 + # Suppress pytest.PytestUnraisableExceptionWarning: + # Exception ignored while calling deallocator + # This questionable design quirk comes from urllib.request.urlretrieve, + # so we shouldn't shim around it. + e.value.close() diff --git a/ml_datasets/util.py b/ml_datasets/util.py index b62e105..1cfa797 100644 --- a/ml_datasets/util.py +++ b/ml_datasets/util.py @@ -37,15 +37,20 @@ def dl_progress(count, block_size, total_size): else: progbar.update(block_size) - error_msg = "URL fetch failure on {}: {} -- {}" if not os.path.exists(fpath): try: try: urlretrieve(origin, fpath, dl_progress) - except URLError as e: - raise Exception(error_msg.format(origin, e.errno, e.reason)) + # Enrich download exceptions with full file name + # HTTPError is a subclass of URLError, so it must be caught first except HTTPError as e: - raise Exception(error_msg.format(origin, e.code, e.msg)) + error_msg = "URL fetch failure on {} : {} -- {}" + e.msg = error_msg.format(origin, e.code, e.msg) + raise + except URLError as e: + error_msg = "URL fetch failure on {} -- {}" + e.reason = error_msg.format(origin, e.reason) + raise except (Exception, KeyboardInterrupt): if os.path.exists(fpath): os.remove(fpath)