From b89b7e049708cf2ba03ed10b5d1647bc3e5eedd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Muras?=
Date: Tue, 2 Dec 2025 15:08:38 +0100
Subject: [PATCH 1/3] html_scraper - url processing

---
 .../windows/modules/auxiliary/html_scraper.py | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/analyzer/windows/modules/auxiliary/html_scraper.py b/analyzer/windows/modules/auxiliary/html_scraper.py
index c6b2de97..c8d1312d 100644
--- a/analyzer/windows/modules/auxiliary/html_scraper.py
+++ b/analyzer/windows/modules/auxiliary/html_scraper.py
@@ -56,24 +56,23 @@ def upload_to_htmldump_folder(file_name: str, content: bytes):
 
     def scrape_html(self):
         if not HAVE_SELENIUM:
-            log.debug("Selenium not installed on machine, not scraping", self.driver_path)
+            log.warning(f"Selenium not installed on machine, not scraping: {self.driver_path}")
             return
 
         if not os.path.isfile(self.driver_path):
-            log.debug("Web driver not found in path %s, not scraping", self.driver_path)
+            log.warning(f"Web driver not found in path %s, not scraping: {self.driver_path}")
             return
 
-        if not hasattr(self.config, "category") or self.config.category != "file":
-            log.debug("Category is not file, not scraping", self.config.category)
+        if not hasattr(self.config, "category") or self.config.category not in ("file", "url"):
+            log.debug(f"Category is neither 'file' nor 'url', not scraping. (Category is {self.config.category})")
             return
 
-        if not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type:
-            log.debug("File is not html, not scraping", self.config.category)
+        if (self.config.category == "file" and
+                (not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type)):
+            log.debug(f"File is not html, not scraping (file_type is {self.config.file_type}")
             return
 
         try:
-            file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
-
             service = Service(self.driver_path)
 
             # This flag ensures that gecko driver will run without opening a cmd window
@@ -82,29 +81,33 @@ def scrape_html(self):
             firefox_options = webdriver.FirefoxOptions()
             firefox_options.add_argument("--disable-gpu")
             firefox_options.headless = True
-
             self.browser = webdriver.Firefox(options=firefox_options, service=service)
             self.browser.set_page_load_timeout(10)
 
-            sample_url = "file:///{}".format(os.path.abspath(file_path))
+            if self.config.category == "file":
+                file_path = os.path.join(os.environ["TEMP"] + os.sep, str(self.config.file_name))
+                sample_url = "file:///{}".format(os.path.abspath(file_path))
+            else:
+                sample_url = self.config.target
+
+            log.debug(f"html_scraper try to scrape {sample_url}")
             try:
                 self.browser.get(sample_url)
                 time.sleep(self.browser_runtime)
             except TimeoutException:
                 log.warning("Page load timed out")
 
-            log.debug("Starting upload")
             self.upload_to_htmldump_folder("html_dump.dump", self.browser.page_source.encode())
 
             if not self.browser.current_url.startswith("file://"):
                 self.upload_to_htmldump_folder("last_url.dump", self.browser.current_url.encode())
 
-            log.debug("HTML scraped successfully")
         except Exception as e:
             log.error(e, exc_info=True)
 
     def run(self):
         if not self.enabled:
+            log.debug("html_scraper RUN rejected because is disabled in config")
             return False
 
         self.scrape_html()
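Note on PATCH 1/3: the auxiliary module now accepts both analysis categories. For category "file" the dropped sample is loaded from the guest's TEMP directory through a file:// URL; for category "url" the browser navigates straight to the submitted target. Below is a minimal, runnable sketch of that selection logic; FakeConfig is a hypothetical stand-in for the analyzer's self.config:

    import os


    class FakeConfig:
        # Hypothetical stand-in for the analyzer's task configuration.
        category = "url"
        target = "http://example.com/landing"
        file_name = None


    def build_sample_url(config):
        if config.category == "file":
            # "file" analyses open the dropped sample from the guest's TEMP
            # directory (os.environ["TEMP"] on the Windows guest).
            file_path = os.path.join(os.environ.get("TEMP", "/tmp"), str(config.file_name))
            return "file:///{}".format(os.path.abspath(file_path))
        # "url" analyses navigate directly to the submitted target.
        return config.target


    print(build_sample_url(FakeConfig()))  # -> http://example.com/landing

This also explains the upload guard kept in scrape_html: last_url.dump is only written when the final URL no longer starts with "file://", which captures redirects out of a local sample as well as the final URL of a live "url" analysis.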
From 70eb91677a626f3a87746605bd0c9085feebc856 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Muras?=
Date: Tue, 2 Dec 2025 15:13:25 +0100
Subject: [PATCH 2/3] html_scraper - verify URLExtract

---
 modules/processing/html_scraper.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/modules/processing/html_scraper.py b/modules/processing/html_scraper.py
index a30fea5e..ef8f57b2 100644
--- a/modules/processing/html_scraper.py
+++ b/modules/processing/html_scraper.py
@@ -6,6 +6,7 @@
 from typing import Optional
 
 from lib.cuckoo.common.abstracts import Processing
+from lib.cuckoo.common.exceptions import CuckooDependencyError
 
 from data.scraper_safe_url_list import safe_url_list
 
@@ -46,13 +47,14 @@ def force_decode(text: str, max_decode_depth: int) -> Optional[str]:
 
 class HtmlScraper(Processing):
-    def run(self):
+    def __init__(self, *args, **kwargs):
+        self.key = "html_scraper"
         if not HAVE_URLEXTRACT:
-            print("Missed optional dependency: poetry run pip install -r extra/optional_dependencies.txt")
-            return
+            raise CuckooDependencyError("Missing dependency 'URLExtract'")
+        super().__init__(*args, **kwargs)
 
+    def run(self):
         log.debug("Started html dump processing")
-        self.key = "html_scraper"
 
         html_dump_path = os.path.join(self.analysis_path, "htmldump", "html_dump.dump")
         last_url_path = os.path.join(self.analysis_path, "htmldump", "last_url.dump")
 

From 0a1f3558300b6d6f6b828ae207ad5a5b2ec23bf9 Mon Sep 17 00:00:00 2001
From: doomedraven
Date: Fri, 2 Jan 2026 23:22:28 +0100
Subject: [PATCH 3/3] Update html_scraper.py

---
 analyzer/windows/modules/auxiliary/html_scraper.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/analyzer/windows/modules/auxiliary/html_scraper.py b/analyzer/windows/modules/auxiliary/html_scraper.py
index c8d1312d..296e1fa6 100644
--- a/analyzer/windows/modules/auxiliary/html_scraper.py
+++ b/analyzer/windows/modules/auxiliary/html_scraper.py
@@ -56,20 +56,20 @@ def upload_to_htmldump_folder(file_name: str, content: bytes):
 
     def scrape_html(self):
         if not HAVE_SELENIUM:
-            log.warning(f"Selenium not installed on machine, not scraping: {self.driver_path}")
+            log.warning("Selenium not installed on machine, not scraping", self.driver_path)
             return
 
         if not os.path.isfile(self.driver_path):
-            log.warning(f"Web driver not found in path %s, not scraping: {self.driver_path}")
+            log.warning("Web driver not found in path %s, not scraping", self.driver_path)
             return
 
         if not hasattr(self.config, "category") or self.config.category not in ("file", "url"):
-            log.debug(f"Category is neither 'file' nor 'url', not scraping. (Category is {self.config.category})")
+            log.debug("Category %s is neither 'file' nor 'url', not scraping", self.config.category)
             return
 
         if (self.config.category == "file" and
                 (not hasattr(self.config, "file_type") or "HTML" not in self.config.file_type)):
-            log.debug(f"File is not html, not scraping (file_type is {self.config.file_type}")
+            log.debug("File is not html, not scraping (file_type is %s)", self.config.file_type)
             return
 
         try:
@@ -90,7 +90,7 @@ def scrape_html(self):
             else:
                 sample_url = self.config.target
 
-            log.debug(f"html_scraper try to scrape {sample_url}")
+            log.debug("html_scraper try to scrape: %s", sample_url)
             try:
                 self.browser.get(sample_url)
                 time.sleep(self.browser_runtime)
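Note on PATCH 2/3: moving the URLExtract check into __init__ means a missing dependency now raises CuckooDependencyError when the processing module is instantiated, instead of printing a hint and silently skipping every analysis inside run(). A minimal sketch of the fail-fast pattern, with a plain Exception subclass standing in for CAPE's CuckooDependencyError:

    # Optional dependency probe, mirroring the module's HAVE_URLEXTRACT flag.
    try:
        from urlextract import URLExtract  # noqa: F401
        HAVE_URLEXTRACT = True
    except ImportError:
        HAVE_URLEXTRACT = False


    class DependencyError(Exception):
        """Stand-in for lib.cuckoo.common.exceptions.CuckooDependencyError."""


    class HtmlScraperSketch:
        def __init__(self):
            self.key = "html_scraper"
            if not HAVE_URLEXTRACT:
                # Fail once at module load time rather than on every task.
                raise DependencyError("Missing dependency 'URLExtract'")

Note on PATCH 3/3: the f-string log calls are converted back to %-style logger arguments, which the logging module interpolates lazily: the message is only rendered if a handler actually emits the record, and each passed argument needs a matching placeholder such as %s. For example:

    import logging

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger("html_scraper")

    sample_url = "http://example.com/"
    # Deferred interpolation: with an f-string the message would be built
    # eagerly even when DEBUG records are filtered out.
    log.debug("html_scraper try to scrape: %s", sample_url)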