From 41114dd5b04b1087830cf5d55582a04566dbb003 Mon Sep 17 00:00:00 2001
From: crusaderky
Date: Fri, 12 Dec 2025 16:39:59 +0000
Subject: [PATCH 1/2] Don't obfuscate URLError

---
 ml_datasets/test/test_util.py | 25 +++++++++++++++++++++++++
 ml_datasets/util.py           | 13 +++++++++----
 2 files changed, 34 insertions(+), 4 deletions(-)
 create mode 100644 ml_datasets/test/test_util.py

diff --git a/ml_datasets/test/test_util.py b/ml_datasets/test/test_util.py
new file mode 100644
index 0000000..3c96330
--- /dev/null
+++ b/ml_datasets/test/test_util.py
@@ -0,0 +1,25 @@
+import pytest
+from urllib.error import HTTPError, URLError
+from ml_datasets.util import get_file
+
+
+def test_get_file_domain_resolution_fails():
+    with pytest.raises(URLError, match="test_non_existent_file.*Name or service not known"):
+        get_file(
+            "non_existent_file.txt",
+            "http://test_notexist.wth/test_non_existent_file.txt"
+        )
+
+
+def test_get_file_404_file_not_found():
+    with pytest.raises(HTTPError, match="test_non_existent_file.*404.*Not Found") as e:
+        get_file(
+            "non_existent_file.txt",
+            "http://google.com/test_non_existent_file.txt"
+        )
+    assert e.value.code == 404
+    # Suppress pytest.PytestUnraisableExceptionWarning:
+    # Exception ignored while calling deallocator
+    # This questionable design quirk comes from urllib.request.urlretrieve,
+    # so we shouldn't shim around it.
+    e.value.close()
diff --git a/ml_datasets/util.py b/ml_datasets/util.py
index b62e105..1cfa797 100644
--- a/ml_datasets/util.py
+++ b/ml_datasets/util.py
@@ -37,15 +37,20 @@ def dl_progress(count, block_size, total_size):
         else:
             progbar.update(block_size)
 
-    error_msg = "URL fetch failure on {}: {} -- {}"
     if not os.path.exists(fpath):
         try:
             try:
                 urlretrieve(origin, fpath, dl_progress)
-            except URLError as e:
-                raise Exception(error_msg.format(origin, e.errno, e.reason))
+            # Enrich download exceptions with full file name
+            # HTTPError is a subclass of URLError, so it must be caught first
             except HTTPError as e:
-                raise Exception(error_msg.format(origin, e.code, e.msg))
+                error_msg = "URL fetch failure on {} : {} -- {}"
+                e.msg = error_msg.format(origin, e.code, e.msg)
+                raise
+            except URLError as e:
+                error_msg = "URL fetch failure on {} -- {}"
+                e.reason = error_msg.format(origin, e.reason)
+                raise
         except (Exception, KeyboardInterrupt):
             if os.path.exists(fpath):
                 os.remove(fpath)

From 0842b98ba49cf8c6c16bf963eb36713c1435a3ae Mon Sep 17 00:00:00 2001
From: crusaderky
Date: Fri, 12 Dec 2025 16:51:55 +0000
Subject: [PATCH 2/2] fix regex on windows and mac

---
 ml_datasets/test/test_util.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ml_datasets/test/test_util.py b/ml_datasets/test/test_util.py
index 3c96330..d37826b 100644
--- a/ml_datasets/test/test_util.py
+++ b/ml_datasets/test/test_util.py
@@ -4,7 +4,9 @@
 
 
 def test_get_file_domain_resolution_fails():
-    with pytest.raises(URLError, match="test_non_existent_file.*Name or service not known"):
+    with pytest.raises(
+        URLError, match=r"test_non_existent_file.*(not known|getaddrinfo failed)"
+    ):
         get_file(
             "non_existent_file.txt",
             "http://test_notexist.wth/test_non_existent_file.txt"