From b89b7e049708cf2ba03ed10b5d1647bc3e5eedd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Muras?=
Date: Tue, 2 Dec 2025 15:08:38 +0100
Subject: [PATCH 1/3] html_scraper - url processing

---
 .../windows/modules/auxiliary/html_scraper.py | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/analyzer/windows/modules/auxiliary/html_scraper.py b/analyzer/windows/modules/auxiliary/html_scraper.py
index c6b2de97..c8d1312d 100644
--- a/analyzer/windows/modules/auxiliary/html_scraper.py
+++ b/analyzer/windows/modules/auxiliary/html_scraper.py
@@ -56,24 +56,23 @@ def upload_to_htmldump_folder(file_name: str, content: bytes):
 
     def scrape_html(self):
         if not HAVE_SELENIUM:
-            log.debug("Selenium not installed on machine, not scraping", self.driver_path)
+            log.warning(f"Selenium not installed on machine, not scraping: {self.driver_path}")
             return
 
         if not os.path.isfile(self.driver_path):
-            log.debug("Web driver not found in path %s, not scraping", self.driver_path)
+            log.warning(f"Web driver not found in path %s, not scraping: {self.driver_path}")
             return
 
-        if not hasattr(self.config, "category") or self.config.category != "file":
-            log.debug("Category is not file, not scraping", self.config.category)
+        if not hasattr(self.config, "category") or self.config.category not in ("file", "url"):
+            log.debug(f"Category is neither 'file' nor 'url', not scraping. (Category is {self.config.category})")
             return
 
-        if not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type:
-            log.debug("File is not html, not scraping", self.config.category)
+        if (self.config.category == "file" and
+                (not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type)):
+            log.debug(f"File is not html, not scraping (file_type is {self.config.file_type}")
             return
 
         try:
-            file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
-
             service = Service(self.driver_path)
 
             # This flag ensures that gecko driver will run without opening a cmd window
@@ -82,29 +81,33 @@ def scrape_html(self):
             firefox_options = webdriver.FirefoxOptions()
             firefox_options.add_argument("--disable-gpu")
             firefox_options.headless = True
-
             self.browser = webdriver.Firefox(options=firefox_options, service=service)
             self.browser.set_page_load_timeout(10)
 
-            sample_url = "file:///{}".format(os.path.abspath(file_path))
+            if self.config.category == "file":
+                file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
+                sample_url = "file:///{}".format(os.path.abspath(file_path))
+            else:
+                sample_url = self.config.target
+
+            log.debug(f"html_scraper try to scrape {sample_url}")
             try:
                 self.browser.get(sample_url)
                 time.sleep(self.browser_runtime)
             except TimeoutException:
                 log.warning("Page load timed out")
 
-            log.debug("Starting upload")
             self.upload_to_htmldump_folder("html_dump.dump", self.browser.page_source.encode())
 
             if not self.browser.current_url.startswith("file://"):
                 self.upload_to_htmldump_folder("last_url.dump", self.browser.current_url.encode())
 
-            log.debug("HTML scraped successfully")
         except Exception as e:
             log.error(e, exc_info=True)
 
     def run(self):
         if not self.enabled:
+            log.debug("html_scraper RUN rejected because is disabled in config")
             return False
 
         self.scrape_html()
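Note on PATCH 1/3: the auxiliary module now accepts both analysis categories. For category "file" the dropped sample is loaded from the guest's TEMP directory through a file:// URL; for category "url" the browser navigates straight to the submitted target. Below is a minimal, runnable sketch of that selection logic; FakeConfig is a hypothetical stand-in for the analyzer's self.config:

    import os


    class FakeConfig:
        # Hypothetical stand-in for the analyzer's task configuration.
        category = "url"
        target = "http://example.com/landing"
        file_name = None


    def build_sample_url(config):
        if config.category == "file":
            # "file" analyses open the dropped sample from the guest's TEMP
            # directory (os.environ["TEMP"] on the Windows guest).
            file_path = os.path.join(os.environ.get("TEMP", "/tmp"), str(config.file_name))
            return "file:///{}".format(os.path.abspath(file_path))
        # "url" analyses navigate directly to the submitted target.
        return config.target


    print(build_sample_url(FakeConfig()))  # -> http://example.com/landing

This also explains the upload guard kept in scrape_html: last_url.dump is only written when the final URL no longer starts with "file://", which captures redirects out of a local sample as well as the final URL of a live "url" analysis.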
From 70eb91677a626f3a87746605bd0c9085feebc856 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Muras?=
Date: Tue, 2 Dec 2025 15:13:25 +0100
Subject: [PATCH 2/3] html_scraper - verify URLExtract

---
 modules/processing/html_scraper.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/processing/html_scraper.py b/modules/processing/html_scraper.py
index a30fea5e..ef8f57b2 100644
--- a/modules/processing/html_scraper.py
+++ b/modules/processing/html_scraper.py
@@ -6,6 +6,7 @@
 from typing import Optional
 
 from lib.cuckoo.common.abstracts import Processing
+from lib.cuckoo.common.exceptions import CuckooDependencyError
 
 from data.scraper_safe_url_list import safe_url_list
 
@@ -46,13 +47,14 @@ def force_decode(text: str, max_decode_depth: int) -> Optional[str]:
 
 class HtmlScraper(Processing):
-    def run(self):
+    def __init__(self, *args, **kwargs):
+        self.key = "html_scraper"
         if not HAVE_URLEXTRACT:
-            print("Missed optional dependency: poetry run pip install -r extra/optional_dependencies.txt")
-            return
+            raise CuckooDependencyError("Missing dependency 'URLExtract'")
+        super().__init__(*args, **kwargs)
 
+    def run(self):
         log.debug("Started html dump processing")
-        self.key = "html_scraper"
 
         html_dump_path = os.path.join(self.analysis_path, "htmldump", "html_dump.dump")
         last_url_path = os.path.join(self.analysis_path, "htmldump", "last_url.dump")
 

From 0a1f3558300b6d6f6b828ae207ad5a5b2ec23bf9 Mon Sep 17 00:00:00 2001
From: doomedraven
Date: Fri, 2 Jan 2026 23:22:28 +0100
Subject: [PATCH 3/3] Update html_scraper.py

---
 analyzer/windows/modules/auxiliary/html_scraper.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/analyzer/windows/modules/auxiliary/html_scraper.py b/analyzer/windows/modules/auxiliary/html_scraper.py
index c8d1312d..296e1fa6 100644
--- a/analyzer/windows/modules/auxiliary/html_scraper.py
+++ b/analyzer/windows/modules/auxiliary/html_scraper.py
@@ -56,20 +56,20 @@ def upload_to_htmldump_folder(file_name: str, content: bytes):
 
     def scrape_html(self):
         if not HAVE_SELENIUM:
-            log.warning(f"Selenium not installed on machine, not scraping: {self.driver_path}")
+            log.warning("Selenium not installed on machine, not scraping", self.driver_path)
             return
 
         if not os.path.isfile(self.driver_path):
-            log.warning(f"Web driver not found in path %s, not scraping: {self.driver_path}")
+            log.warning("Web driver not found in path %s, not scraping", self.driver_path)
             return
 
         if not hasattr(self.config, "category") or self.config.category not in ("file", "url"):
-            log.debug(f"Category is neither 'file' nor 'url', not scraping. (Category is {self.config.category})")
+            log.debug("Category %s is neither 'file' nor 'url', not scraping", self.config.category)
             return
 
         if (self.config.category == "file" and
                 (not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type)):
-            log.debug(f"File is not html, not scraping (file_type is {self.config.file_type}")
+            log.debug("File is not html, not scraping (file_type is %s)", self.config.file_type)
             return
 
         try:
@@ -90,7 +90,7 @@ def scrape_html(self):
             else:
                 sample_url = self.config.target
 
-            log.debug(f"html_scraper try to scrape {sample_url}")
+            log.debug("html_scraper try to scrape: %s", sample_url)
             try:
                 self.browser.get(sample_url)
                 time.sleep(self.browser_runtime)
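Note on PATCH 2/3: moving the URLExtract check into __init__ means a missing dependency now raises CuckooDependencyError when the processing module is instantiated, instead of printing a hint and silently skipping every analysis inside run(). A minimal sketch of the fail-fast pattern, with a plain Exception subclass standing in for CAPE's CuckooDependencyError:

    # Optional dependency probe, mirroring the module's HAVE_URLEXTRACT flag.
    try:
        from urlextract import URLExtract  # noqa: F401
        HAVE_URLEXTRACT = True
    except ImportError:
        HAVE_URLEXTRACT = False


    class DependencyError(Exception):
        """Stand-in for lib.cuckoo.common.exceptions.CuckooDependencyError."""


    class HtmlScraperSketch:
        def __init__(self):
            self.key = "html_scraper"
            if not HAVE_URLEXTRACT:
                # Fail once at module load time rather than on every task.
                raise DependencyError("Missing dependency 'URLExtract'")

Note on PATCH 3/3: the f-string log calls are converted back to %-style logger arguments, which the logging module interpolates lazily: the message is only rendered if a handler actually emits the record, and each passed argument needs a matching placeholder such as %s. For example:

    import logging

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger("html_scraper")

    sample_url = "http://example.com/"
    # Deferred interpolation: with an f-string the message would be built
    # eagerly even when DEBUG records are filtered out.
    log.debug("html_scraper try to scrape: %s", sample_url)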