From 8fc74e69db3deb0f7b31b7d6a7a451203110c586 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Wed, 10 Sep 2025 12:43:34 -0700 Subject: [PATCH 1/4] tests: add some basic coverage of the `check_links` function --- linkcheck/tests/test_linkcheck.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index 8a3f1e1..c9edf3b 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -1,5 +1,6 @@ import os from datetime import datetime, timedelta +from http import HTTPStatus from io import StringIO from unittest.mock import patch @@ -13,6 +14,7 @@ from django.test import LiveServerTestCase, TestCase from django.test.utils import override_settings from django.urls import reverse +from django.utils import timezone from requests.exceptions import ConnectionError from linkcheck.linkcheck_settings import MAX_URL_LENGTH @@ -25,6 +27,7 @@ unregister_listeners, ) from linkcheck.models import Link, Url +from linkcheck.utils import check_links from linkcheck.views import get_jquery_min_js from .sampleapp.models import Author, Book, Journal, Page @@ -1203,6 +1206,33 @@ def test_filter_callable(self): ) +class TestCheckLinks(TestCase): + + @requests_mock.Mocker() + def test_check_links(self, mocker): + good_url = 'https://example.com/good' + mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK') + Url.objects.create(url=good_url) + + bad_url = 'https://example.com/bad' + mocker.register_uri('HEAD', bad_url, status_code=HTTPStatus.NOT_FOUND, reason='NOT FOUND') + Url.objects.create(url=bad_url) + + exception_url = 'https://example.com/exception' + mocker.register_uri('HEAD', exception_url, exc=ConnectionError("Something went wrong")) + Url.objects.create(url=exception_url) + + recently_checked_url = 'https://example.com/recent' + # Shouldn't be requested + Url.objects.create(url=recently_checked_url, last_checked=timezone.now() - 
timedelta(days=1)) + + self.assertEqual(check_links(), 3) + self.assertEqual(Url.objects.get(url=good_url).status, True) + self.assertEqual(Url.objects.get(url=bad_url).status, False) + self.assertEqual(Url.objects.get(url=exception_url).status, False) + self.assertEqual(Url.objects.get(url=recently_checked_url).status, None) + + def get_command_output(command, *args, **kwargs): """ Helper function for running a management command and checking its output From 30b160f19398f995a92f097d6881f461b6f28775 Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Wed, 10 Sep 2025 14:30:30 -0700 Subject: [PATCH 2/4] tests: fix naive datetime warning Was raising a `RuntimeWarning: DateTimeField Url.last_checked received a naive datetime ... while time zone support is active.` --- linkcheck/tests/test_linkcheck.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index c9edf3b..3f5bd24 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -1,5 +1,5 @@ import os -from datetime import datetime, timedelta +from datetime import timedelta from http import HTTPStatus from io import StringIO from unittest.mock import patch @@ -873,7 +873,7 @@ def test_checklinks_command(self): "1 internal URLs and 0 external URLs have been checked.\n" ) - yesterday = datetime.now() - timedelta(days=1) + yesterday = timezone.now() - timedelta(days=1) Url.objects.all().update(last_checked=yesterday) out = StringIO() call_command('checklinks', externalinterval=20, stdout=out) From 24e7632d7d59a53c951a4bca8ca0c1c37b72a89a Mon Sep 17 00:00:00 2001 From: Kevin Marsh Date: Wed, 10 Sep 2025 14:51:52 -0700 Subject: [PATCH 3/4] check_links: add concurrent version of the `check_links` helper function Since we're using sqlite as the test db backend, we can't effectively test the `Url` objects being saved in the `ThreadPoolExecutor` futures, but there's enough test coverage that under the hood 
`Url.check_url` is doing the right thing --- linkcheck/tests/test_linkcheck.py | 37 +++++++++++++++-- linkcheck/utils.py | 66 +++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 4 deletions(-) diff --git a/linkcheck/tests/test_linkcheck.py b/linkcheck/tests/test_linkcheck.py index 3f5bd24..551c6ab 100644 --- a/linkcheck/tests/test_linkcheck.py +++ b/linkcheck/tests/test_linkcheck.py @@ -27,7 +27,7 @@ unregister_listeners, ) from linkcheck.models import Link, Url -from linkcheck.utils import check_links +from linkcheck.utils import check_links, concurrent_check_links from linkcheck.views import get_jquery_min_js from .sampleapp.models import Author, Book, Journal, Page @@ -1208,8 +1208,10 @@ def test_filter_callable(self): class TestCheckLinks(TestCase): - @requests_mock.Mocker() - def test_check_links(self, mocker): + def _setup_mock_urls(self, mocker): + """ + Set up common mock URLs for link checking tests. + """ good_url = 'https://example.com/good' mocker.register_uri('HEAD', good_url, status_code=HTTPStatus.OK, reason='OK') Url.objects.create(url=good_url) @@ -1224,7 +1226,13 @@ def test_check_links(self, mocker): recently_checked_url = 'https://example.com/recent' # Shouldn't be requested - Url.objects.create(url=recently_checked_url, last_checked=timezone.now() - timedelta(days=1)) + Url.objects.create(url=recently_checked_url, status=None, last_checked=timezone.now() - timedelta(days=1)) + + return (good_url, bad_url, exception_url, recently_checked_url) + + @requests_mock.Mocker() + def test_check_links(self, mocker): + good_url, bad_url, exception_url, recently_checked_url = self._setup_mock_urls(mocker) self.assertEqual(check_links(), 3) self.assertEqual(Url.objects.get(url=good_url).status, True) @@ -1232,6 +1240,27 @@ def test_check_links(self, mocker): self.assertEqual(Url.objects.get(url=exception_url).status, False) self.assertEqual(Url.objects.get(url=recently_checked_url).status, None) + @requests_mock.Mocker() + def 
test_concurrent_check_links(self, mocker): + self._setup_mock_urls(mocker) + + # Since the tests are running in sqlite, we can't insert data via our threaded code + # there's enough other test coverage that we can use `Url.save` as a proxy + with patch.object(Url, "save") as patched_save: + self.assertEqual(concurrent_check_links(), 3) + self.assertEqual(patched_save.call_count, 3) + + def test_concurrent_check_links_error_handling(self): + Url.objects.create(url='https://example.com/good') + with ( + patch("linkcheck.utils.logger.exception") as patched_logged_exception, + patch.object(Url, "check_external", side_effect=ValueError("oops")), + ): + self.assertEqual(concurrent_check_links(), 0) + self.assertEqual(patched_logged_exception.call_count, 1) + msg, *args = patched_logged_exception.call_args[0] + self.assertEqual(msg % tuple(args), "ValueError while checking https://example.com/good: oops") + def get_command_output(command, *args, **kwargs): """ diff --git a/linkcheck/utils.py b/linkcheck/utils.py index 0b4e6d7..8058ac5 100644 --- a/linkcheck/utils.py +++ b/linkcheck/utils.py @@ -1,4 +1,6 @@ import logging +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import timedelta from django.apps import apps @@ -120,6 +122,70 @@ def check_links(external_recheck_interval=10080, limit=-1, check_internal=True, return check_count +def concurrent_check_links( + external_recheck_interval=10080, + limit=-1, + check_internal=True, + check_external=True, + max_workers=20, +): + """ + Return the number of links effectively checked. 
+ A concurrent version of `check_links` + + Args: + external_recheck_interval: Minutes before rechecking external links + limit: Maximum number of URLs to check (-1 for unlimited) + check_internal: Whether to check internal links + check_external: Whether to check external links + max_workers: Maximum number of concurrent threads + """ + + urls = Url.objects.all() + + # An optimization for when check_internal is False + if not check_internal: + recheck_datetime = timezone.now() - timedelta(minutes=external_recheck_interval) + urls = urls.exclude(last_checked__gt=recheck_datetime) + + url_list = list(urls[:limit] if limit > 0 else urls) + + if not url_list: + return 0 + + # Thread-safe counter + check_count = 0 + count_lock = threading.Lock() + + def check_single_url(url_obj): + """Check a single URL and return 1 if checked, 0 if not""" + try: + status = url_obj.check_url(check_internal=check_internal, check_external=check_external) + return 1 if status is not None else 0 + except Exception as e: + logger.exception( + "%s while checking %s: %s", + type(e).__name__, + url_obj.url, + e + ) + return 0 + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit all tasks + future_to_url = { + executor.submit(check_single_url, url): url + for url in url_list + } + # Process completed futures + for future in as_completed(future_to_url): + result = future.result() + with count_lock: + check_count += result + + return check_count + + def update_urls(urls, content_type, object_id): # Structure of urls param is [(field, link text, url), ... ] From ba90c0a66b7afcb8d3861e9ffc00007c928a0ed8 Mon Sep 17 00:00:00 2001 From: Ben Stockermans Date: Fri, 19 Sep 2025 11:35:58 +0100 Subject: [PATCH 4/4] Updated docstring to warn users about potentially triggering attack detection if you have many links to the same domain. 
--- linkcheck/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/linkcheck/utils.py b/linkcheck/utils.py index 8058ac5..ea1229b 100644 --- a/linkcheck/utils.py +++ b/linkcheck/utils.py @@ -131,7 +131,11 @@ def concurrent_check_links( ): """ Return the number of links effectively checked. - A concurrent version of `check_links` + + A concurrent version of `check_links`. It should be faster than `check_links`, but + be aware that if you have multiple links to the same domain, you risk triggering + attack detection on the target server. This concurrent version is therefore best used + for links spread across many different domains, or for internal links. Args: external_recheck_interval: Minutes before rechecking external links