From b0a23683bb0d02720eaab21f466ac37ca6e8050e Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:30:48 +0100 Subject: [PATCH 01/11] Update network_utils.py --- lib/cuckoo/common/network_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/cuckoo/common/network_utils.py b/lib/cuckoo/common/network_utils.py index 86302687f28..5c450873ea1 100644 --- a/lib/cuckoo/common/network_utils.py +++ b/lib/cuckoo/common/network_utils.py @@ -31,6 +31,7 @@ "internetconnectw", "winhttpopenrequest", "winhttpsendrequest", + "winhttpgetproxyforurl", "winhttpconnect", "winhttpopen", "internetopenurla", From 06dd1e6c684965f70febb96f8468f811c24c1c93 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:40:18 +0100 Subject: [PATCH 02/11] Update network_utils.py --- lib/cuckoo/common/network_utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/cuckoo/common/network_utils.py b/lib/cuckoo/common/network_utils.py index 5c450873ea1..cdad4ca53f2 100644 --- a/lib/cuckoo/common/network_utils.py +++ b/lib/cuckoo/common/network_utils.py @@ -15,6 +15,7 @@ "gethostbynamew", "dnsquery_a", "dnsquery_w", + "dnsquery_utf8", "dnsqueryex", "dnsquery", } @@ -25,6 +26,8 @@ "internetcrackurlw", "httpsendrequesta", "httpsendrequestw", + "httpsendrequestexa", + "httpsendrequestexw", "internetsendrequesta", "internetsendrequestw", "internetconnecta", @@ -38,6 +41,14 @@ "internetopenurlw", "httpopenrequesta", "httpopenrequestw", + "urldownloadtofilew", + "urldownloadtocachefilew", + "cryptretrieveobjectbyurlw", + "urlcanonicalizew", + "mkparsedisplayname", + "mkparsedisplaynameex", + "dsenumeratedomaintrustsw", + "wnetuseconnectionw", "isvalidurl", } @@ -96,6 +107,8 @@ def _extract_domain_from_call(call, args_map): "QueryName", "lpstrName", "pName", + "ServerName", + "servername", ): v = args_map.get(name) if isinstance(v, str) and v.strip(): From e22b5f457f800d6ce3f625da09cbfb7a9663896b Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:47:46 +0100 Subject: [PATCH 03/11] add http(s) from behavior --- modules/processing/behavior.py | 13 ++++++ modules/processing/network.py | 76 +++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/modules/processing/behavior.py b/modules/processing/behavior.py index 942fd2bced5..bbf08586c29 100644 --- a/modules/processing/behavior.py +++ b/modules/processing/behavior.py @@ -1232,6 +1232,7 @@ class NetworkMap: def __init__(self): self.endpoint_map = defaultdict(list) # (ip, port) -> [pinfo] self.http_host_map = defaultdict(list) # host -> [pinfo] + self.http_requests = [] # url -> [pinfo] self.dns_intents = defaultdict(list) # domain -> [intent] def event_apicall(self, call, process): @@ -1277,6 +1278,17 @@ def event_apicall(self, call, process): if host: _add_http_host(self.http_host_map, host, pinfo, sock=sock) + if u: + self.http_requests.append( + { + "url": u, + "host": host, + "process_id": process.get("process_id"), + "process_name": process.get("process_name"), + "time": _parse_behavior_ts(call.get("timestamp")), + } + ) + if isinstance(buf, str): u2 = _extract_first_url(buf) if u2: @@ -1332,6 +1344,7 @@ def run(self): "endpoint_map": endpoint_map_str, "http_host_map": self.http_host_map, "dns_intents": self.dns_intents, + "http_requests": self.http_requests, } diff --git a/modules/processing/network.py b/modules/processing/network.py index f402a77148e..2bb3b16f425 100644 --- a/modules/processing/network.py +++ b/modules/processing/network.py @@ -23,7 +23,7 @@ from 
itertools import islice from json import loads from typing import Any, Dict, List, Optional -from urllib.parse import urlunparse +from urllib.parse import urlparse, urlunparse import cachetools.func import dns.resolver @@ -1365,11 +1365,77 @@ def _merge_behavior_network(self, results): # 2. HTTP http_host_map = net_map.get("http_host_map", {}) - existing_hosts = {h.get("host") for h in network.get("http", [])} - http_events = (network.get("http", []) or []) + (network.get("http_ex", []) or []) + (network.get("https_ex", []) or []) - existing_hosts = {_norm_domain(h.get("host")) for h in http_events if h.get("host")} + http_requests = net_map.get("http_requests", []) + + existing_hosts = set() + existing_urls = set() + for h in (network.get("http", []) or []) + (network.get("http_ex", []) or []) + (network.get("https_ex", []) or []): + host = h.get("host") + if host: + existing_hosts.add(_norm_domain(host)) + uri = h.get("uri", "/") + # Store simplistic URL representation for deduplication + existing_urls.add(f"{host}{uri}") + + # Process full requests from behavior + for req in http_requests: + url = req.get("url") + if not url: + continue + + # Parse URL to components + try: + parsed = urlparse(url) + if not parsed.netloc and not parsed.path: + continue + + host = parsed.netloc or req.get("host") + # Handle cases where URL might be just a domain or path + if not host and url and "." in url and "/" not in url: + host = url + + # Fallback host normalization + if not host and req.get("host"): + host = req.get("host") + + uri = parsed.path + if parsed.query: + uri += f"?{parsed.query}" + if not uri: + uri = "/" + + # Check for duplicates + url_key = f"{host}{uri}" + if url_key in existing_urls: + continue + + port = 80 + if parsed.port: + port = parsed.port + elif parsed.scheme == "https": + port = 443 + + entry = { + "host": host, + "port": port, + "uri": uri, + "method": "GET", + "source": "behavior", + "process_id": req.get("process_id"), + "process_name": req.get("process_name"), + "time": req.get("time"), + } + network.setdefault("http", []).append(entry) + if host: + existing_hosts.add(_norm_domain(host)) + existing_urls.add(url_key) + + except Exception: + log.warning("Failed to parse behavior URL: %s", url) + + # Process host-only map for remaining missing hosts for host, procs in http_host_map.items(): - if host not in existing_hosts: + if _norm_domain(host) not in existing_hosts: proc = procs[0] if procs else {} entry = { "host": host, From 18f5661acf6b8ef2979db74fd7d1cec7e1ca2ec1 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:54:16 +0100 Subject: [PATCH 04/11] Update network.py --- modules/processing/network.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/processing/network.py b/modules/processing/network.py index 2bb3b16f425..de175ec5010 100644 --- a/modules/processing/network.py +++ b/modules/processing/network.py @@ -1342,7 +1342,7 @@ def _merge_behavior_network(self, results): if not net_map: return - network = results.get("network", {}) + network = results # 1. 
DNS dns_intents = net_map.get("dns_intents", {}) @@ -1359,7 +1359,7 @@ def _merge_behavior_network(self, results): "source": "behavior", "process_id": proc.get("process_id"), "process_name": proc.get("process_name"), - "time": first_intent.get("ts_epoch"), + "first_seen": first_intent.get("ts_epoch"), } network.setdefault("dns", []).append(entry) @@ -1398,14 +1398,14 @@ def _merge_behavior_network(self, results): if not host and req.get("host"): host = req.get("host") - uri = parsed.path + path = parsed.path if parsed.query: - uri += f"?{parsed.query}" - if not uri: - uri = "/" + path += f"?{parsed.query}" + if not path: + path = "/" # Check for duplicates - url_key = f"{host}{uri}" + url_key = f"{host}{path}" if url_key in existing_urls: continue @@ -1418,12 +1418,13 @@ def _merge_behavior_network(self, results): entry = { "host": host, "port": port, - "uri": uri, + "uri": url, + "path": path, "method": "GET", "source": "behavior", "process_id": req.get("process_id"), "process_name": req.get("process_name"), - "time": req.get("time"), + "first_seen": req.get("time"), } network.setdefault("http", []).append(entry) if host: @@ -1440,7 +1441,8 @@ def _merge_behavior_network(self, results): entry = { "host": host, "port": 80, - "uri": "/", + "uri": f"http://{host}/", + "path": "/", "method": "GET", "source": "behavior", "process_id": proc.get("process_id"), From fda8d198d9fda0ea00c60241278510bfbb07eb17 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:59:54 +0100 Subject: [PATCH 05/11] Update admin_conf.py_example --- admin/admin_conf.py_example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/admin/admin_conf.py_example b/admin/admin_conf.py_example index 50e990fae5a..eccd5698e06 100644 --- a/admin/admin_conf.py_example +++ b/admin/admin_conf.py_example @@ -23,7 +23,7 @@ JUMP_BOX_SECOND_USERNAME = "" JUMP_BOX_SECOND_PORT = 22 NUM_THREADS = 5 -POSTPROCESS = "systemctl restart cape-processor; systemctl status cape-processor" +POSTPROCESS = "systemctl restart cape-processor; systemctl status --no-pager cape-processor" EXCLUDE_DIRS = set( [ From 7a7ce83965bcc8a4d437b217e17abbfe58a7bd45 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:06:12 +0100 Subject: [PATCH 06/11] file details cache (#2894) * file details cache Summary of Changes: 1. Modified `lib/cuckoo/common/objects.py`: * Updated File.init_yara to calculate a SHA256 hash of all compiled YARA rule files. * Stored this hash in File.yara_rules_hash. 2. Modified `modules/processing/CAPE.py`: * In process_file, imported mongo_find_one from dev_utils.mongodb. * Implemented logic to query the MongoDB files collection using the file's SHA256. * Cache Hit: If the file is found and yara_hash matches, file_info is populated from the database, skipping expensive operations like f.get_all() (PE parsing, initial YARA scan) and static_file_info. * Partial Cache Hit: If the file is found but yara_hash differs, the cached data is loaded, but YARA scans are re-run (f.get_yara()), and the yara_hash field is updated. static_file_info is still skipped to avoid re-extracting/re-analyzing static properties. * Cache Miss: If the file is not in the DB, standard processing proceeds, and yara_hash is added to file_info for future caching. This solution optimizes processing time for previously analyzed files while ensuring YARA scan results remain up-to-date when rules change. 
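For reviewers, the lookup flow described above reduces to roughly the following (a condensed sketch of the logic added to modules/processing/CAPE.py in this patch, not the verbatim code; the helper name lookup_cached_file_info is invented here for illustration):

    from dev_utils.mongodb import mongo_find_one
    from lib.cuckoo.common.objects import File

    def lookup_cached_file_info(f, sha256, file_path):
        """Return cached file_info from MongoDB, or None on a cache miss.
        On a partial hit (rule set changed) only the YARA results are redone."""
        db_file = mongo_find_one("files", {"sha256": sha256})
        if not db_file:
            return None                          # cache miss: full processing path
        db_file.pop("_id", None)                 # drop Mongo's internal id
        db_file["path"] = file_path              # always point at the current on-disk path
        if db_file.get("yara_hash") != File.yara_rules_hash:
            # partial hit: the compiled rule set changed since this file was stored
            db_file["yara"] = f.get_yara()
            db_file["cape_yara"] = f.get_yara(category="CAPE")
            db_file["yara_hash"] = File.yara_rules_hash
        return db_file
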
* more * fix * Update CAPE.py * Update CAPE.py * Update test_file_extra_info.py * fixes * Update objects.py * make it on/off --- conf/default/processing.conf.default | 2 + .../common/integrations/file_extra_info.py | 161 ++++++------------ .../file_extra_info_modules/msi_extract.py | 73 ++++++++ lib/cuckoo/common/integrations/utils.py | 23 +++ lib/cuckoo/common/objects.py | 24 ++- modules/processing/CAPE.py | 71 ++++++-- tests/test_file_extra_info.py | 3 +- 7 files changed, 237 insertions(+), 120 deletions(-) create mode 100644 lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py create mode 100644 lib/cuckoo/common/integrations/utils.py diff --git a/conf/default/processing.conf.default b/conf/default/processing.conf.default index ca0c448bca6..8cc2ef115a5 100644 --- a/conf/default/processing.conf.default +++ b/conf/default/processing.conf.default @@ -228,6 +228,8 @@ max_file_size = 90 userdb_signature = no # https://capev2.readthedocs.io/en/latest/usage/patterns_replacement.html replace_patterns = no +# Use file cache to speed up processing by looking up already processed files in MongoDB +file_cache = no # Deduplicate screenshots - You need to install dependency ImageHash>=4.3.1 [deduplication] diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index 31472887fd0..e4c393c735c 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -1,5 +1,4 @@ import concurrent.futures -import functools import hashlib import json import logging @@ -7,11 +6,10 @@ import re import shlex import shutil -import signal import subprocess # from contextlib import suppress -from typing import Any, DefaultDict, List, Optional, Set, Union +from typing import Any, DefaultDict, List, Optional, Set import pebble @@ -38,7 +36,6 @@ from lib.cuckoo.common.load_extra_modules import file_extra_info_load_modules from lib.cuckoo.common.objects import File from lib.cuckoo.common.path_utils import ( - path_delete, path_exists, path_get_size, path_is_file, @@ -176,52 +173,58 @@ def static_file_info( ): log.info("Missed dependencies: pip3 install oletools") - if "MSI Installer" in data_dictionary["type"]: + if "MSI Installer" in data_dictionary["type"] and "msi" not in data_dictionary: data_dictionary["msi"] = parse_msi(file_path) # ToDo we need type checking as it wont work for most of static jobs if HAVE_PEFILE and ("PE32" in data_dictionary["type"] or "MS-DOS executable" in data_dictionary["type"]): - with PortableExecutable(file_path) as pe: - data_dictionary["pe"] = pe.run(task_id) + if "pe" not in data_dictionary: + with PortableExecutable(file_path) as pe: + data_dictionary["pe"] = pe.run(task_id) - if HAVE_FLARE_CAPA: + if HAVE_FLARE_CAPA and "flare_capa" not in data_dictionary: # https://github.com/mandiant/capa/issues/2620 capa_details = flare_capa_details(file_path, "static") if capa_details: data_dictionary["flare_capa"] = capa_details - if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"]: + if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: floss_strings = Floss(file_path, "static", "pe").run() if floss_strings: data_dictionary["floss"] = floss_strings - if "Mono" in data_dictionary["type"]: + if "Mono" in data_dictionary["type"] and "dotnet" not in data_dictionary: if integration_conf.general.dotnet: data_dictionary["dotnet"] = 
DotNETExecutable(file_path).run() - if processing_conf.strings.dotnet: + if processing_conf.strings.dotnet and "dotnet_strings" not in data_dictionary: dotnet_strings = dotnet_user_strings(file_path) if dotnet_strings: data_dictionary.setdefault("dotnet_strings", dotnet_strings) elif (HAVE_OLETOOLS and package in {"doc", "ppt", "xls", "pub"} and integration_conf.general.office) or data_dictionary.get("name", "").endswith((".doc", ".ppt", ".xls", ".pub")): - # options is dict where we need to get pass get_options - data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run() + if "office" not in data_dictionary: + # options is dict where we need to get pass get_options + data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run() elif ("PDF" in data_dictionary["type"] or file_path.endswith(".pdf")) and integration_conf.general.pdf: - data_dictionary["pdf"] = PDF(file_path).run() + if "pdf" not in data_dictionary: + data_dictionary["pdf"] = PDF(file_path).run() elif ( package in {"wsf", "hta"} or data_dictionary["type"] == "XML document text" or file_path.endswith(".wsf") ) and integration_conf.general.windows_script: - data_dictionary["wsf"] = WindowsScriptFile(file_path).run() + if "wsf" not in data_dictionary: + data_dictionary["wsf"] = WindowsScriptFile(file_path).run() # elif package in {"js", "vbs"}: # data_dictionary["js"] = EncodedScriptFile(file_path).run() elif (package == "lnk" or "MS Windows shortcut" in data_dictionary["type"]) and integration_conf.general.lnk: - data_dictionary["lnk"] = LnkShortcut(file_path).run() + if "lnk" not in data_dictionary: + data_dictionary["lnk"] = LnkShortcut(file_path).run() elif ("Java Jar" in data_dictionary["type"] or file_path.endswith(".jar")) and integration_conf.general.java: - if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary): - log.error("procyon_path specified in processing.conf but the file does not exist") - else: - data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run() - elif file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp"): + if "java" not in data_dictionary: + if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary): + log.error("procyon_path specified in processing.conf but the file does not exist") + else: + data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run() + elif (file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp")) and "rdp" not in data_dictionary: data_dictionary["rdp"] = parse_rdp_file(file_path) # It's possible to fool libmagic into thinking our 2007+ file is a zip. 
# So until we have static analysis for zip files, we can use oleid to fail us out silently, @@ -237,13 +240,13 @@ def static_file_info( if not file_path.startswith(exclude_startswith) and not file_path.endswith(excluded_extensions): data_dictionary["data"] = is_text_file(data_dictionary, file_path, processing_conf.CAPE.buffer, data) - if processing_conf.trid.enabled: + if processing_conf.trid.enabled and "trid" not in data_dictionary: data_dictionary["trid"] = trid_info(file_path) - if processing_conf.die.enabled: + if processing_conf.die.enabled and "die" not in data_dictionary: data_dictionary["die"] = detect_it_easy_info(file_path) - if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"]: + if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: floss_strings = Floss(file_path, package).run() if floss_strings: data_dictionary["floss"] = floss_strings @@ -253,7 +256,7 @@ def static_file_info( # think that we want to look them up on-demand (i.e. display the # "strings" button linking to an on_demand URL). data_dictionary["strings"] = [] - elif HAVE_STRINGS: + elif HAVE_STRINGS and "strings" not in data_dictionary: strings = extract_strings(file_path, dedup=True) data_dictionary["strings"] = strings else: @@ -262,7 +265,7 @@ def static_file_info( pass # ToDo we need url support - if HAVE_VIRUSTOTAL and processing_conf.virustotal.enabled: + if HAVE_VIRUSTOTAL and processing_conf.virustotal.enabled and "virustotal" not in data_dictionary: vt_details = vt_lookup("file", file_path, results) if vt_details: data_dictionary["virustotal"] = vt_details @@ -401,25 +404,7 @@ def _extracted_files_metadata( return metadata -def pass_signal(proc, signum, frame): - proc.send_signal(signum) - - -def run_tool(*args, **kwargs) -> Union[bytes, str]: - """Start a subprocess to run the given tool. Make sure to pass a SIGTERM signal to - that process if it is received. - """ - kwargs["stdout"] = subprocess.PIPE - old_handler = None - try: - proc = subprocess.Popen(*args, **kwargs) - old_handler = signal.signal(signal.SIGTERM, functools.partial(pass_signal, proc)) - (stdout, stderr) = proc.communicate() - return stdout - finally: - if old_handler: - signal.signal(signal.SIGTERM, old_handler) - +from lib.cuckoo.common.integrations.utils import run_tool def generic_file_extractors( file: str, @@ -454,24 +439,8 @@ def generic_file_extractors( "tests": tests, } - file_info_funcs = [ - msi_extract, - kixtart_extract, - vbe_extract, - batch_extract, - UnAutoIt_extract, - UPX_unpack, - RarSFX_extract, - Inno_extract, - SevenZip_unpack, - de4dot_deobfuscate, - eziriz_deobfuscate, - office_one, - msix_extract, - UnGPG_extract, - ] - futures = {} + executed_tools = data_dictionary.setdefault("executed_tools", []) with pebble.ProcessPool(max_workers=int(integration_conf.general.max_workers)) as pool: # Prefer custom modules over the built-in ones, since only 1 is allowed # to be the extracted_files_tool. 
@@ -479,6 +448,9 @@ def generic_file_extractors( for module in extra_info_modules: func_timeout = int(getattr(module, "timeout", 60)) funcname = module.__name__.split(".")[-1] + if funcname in executed_tools: + continue + executed_tools.append(funcname) futures[funcname] = pool.schedule(module.extract_details, args=args, kwargs=kwargs, timeout=func_timeout) for extraction_func in file_info_funcs: @@ -489,6 +461,10 @@ def generic_file_extractors( ): continue + if funcname in executed_tools: + continue + executed_tools.append(funcname) + func_timeout = int(getattr(integration_conf, funcname, {}).get("timeout", 60)) futures[funcname] = pool.schedule(extraction_func, args=args, kwargs=kwargs, timeout=func_timeout) pool.join() @@ -677,51 +653,6 @@ def de4dot_deobfuscate(file: str, *, filetype: str, **_) -> ExtractorReturnType: return ctx -@time_tracker -def msi_extract(file: str, *, filetype: str, **kwargs) -> ExtractorReturnType: - """Work on MSI Installers""" - - if "MSI Installer" not in filetype: - return - - # ToDo replace MsiExtract with pymsi - extracted_files = [] - # sudo apt install msitools - with extractor_ctx(file, "MsiExtract", prefix="msidump_", folder=tools_folder) as ctx: - tempdir = ctx["tempdir"] - output = False - if not kwargs.get("tests"): - # msiextract in different way that 7z, we need to add subfolder support - output = run_tool( - [integration_conf.msi_extract.binary, file, "--directory", tempdir], - universal_newlines=True, - stderr=subprocess.PIPE, - ) - if output: - extracted_files = [ - extracted_file - for extracted_file in list(filter(None, output.split("\n"))) - if path_is_file(os.path.join(tempdir, extracted_file)) - ] - else: - output = run_tool( - [sevenzip_binary, "e", f"-o{tempdir}", "-y", file], - universal_newlines=True, - stderr=subprocess.PIPE, - ) - valid_msi_filetypes = ["PE32", "text", "Microsoft Cabinet archive"] - for root, _, filenames in os.walk(tempdir): - for filename in filenames: - path = os.path.join(root, filename) - if any([x in File(path).get_type() for x in valid_msi_filetypes]): - os.rename(path, os.path.join(root, filename.split(".")[-1].strip("'").strip("!"))) - else: - path_delete(path) - extracted_files = collect_extracted_filenames(tempdir) - - ctx["extracted_files"] = extracted_files - - return ctx @time_tracker @@ -1021,3 +952,19 @@ def UnGPG_extract(file: str, filetype: str, data_dictionary: dict, options: dict ctx["extracted_files"] = collect_extracted_filenames(tempdir) return ctx + +file_info_funcs = [ + kixtart_extract, + vbe_extract, + batch_extract, + UnAutoIt_extract, + UPX_unpack, + RarSFX_extract, + Inno_extract, + SevenZip_unpack, + de4dot_deobfuscate, + eziriz_deobfuscate, + office_one, + msix_extract, + UnGPG_extract, +] diff --git a/lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py b/lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py new file mode 100644 index 00000000000..7fd9e43497f --- /dev/null +++ b/lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py @@ -0,0 +1,73 @@ +import logging +import os +import subprocess + +from lib.cuckoo.common.config import Config +from lib.cuckoo.common.constants import CUCKOO_ROOT +from lib.cuckoo.common.integrations.file_extra_info_modules import ( + ExtractorReturnType, + collect_extracted_filenames, + extractor_ctx, + time_tracker, +) +from lib.cuckoo.common.objects import File +from lib.cuckoo.common.path_utils import path_delete, path_exists, path_is_file +from lib.cuckoo.common.integrations.utils import run_tool + +log 
= logging.getLogger(__name__) +integration_conf = Config("integrations") + +sevenzip_binary = os.path.join(CUCKOO_ROOT, "data/7zz") +if integration_conf.SevenZip_unpack.binary: + tmp_sevenzip_binary = os.path.join(CUCKOO_ROOT, integration_conf.SevenZip_unpack.binary) + if path_exists(tmp_sevenzip_binary): + sevenzip_binary = tmp_sevenzip_binary +# fallback +if not path_exists(sevenzip_binary): + sevenzip_binary = "/usr/bin/7z" + +@time_tracker +def extract_details(file: str, *, filetype: str, **kwargs) -> ExtractorReturnType: + """Work on MSI Installers""" + + if "MSI Installer" not in filetype: + return + + # ToDo replace MsiExtract with pymsi + extracted_files = [] + # sudo apt install msitools + with extractor_ctx(file, "MsiExtract", prefix="msidump_") as ctx: + tempdir = ctx["tempdir"] + output = False + if not kwargs.get("tests"): + # msiextract in different way that 7z, we need to add subfolder support + output = run_tool( + [integration_conf.msi_extract.binary, file, "--directory", tempdir], + universal_newlines=True, + stderr=subprocess.PIPE, + ) + if output: + extracted_files = [ + extracted_file + for extracted_file in list(filter(None, output.split("\n"))) + if path_is_file(os.path.join(tempdir, extracted_file)) + ] + else: + output = run_tool( + [sevenzip_binary, "e", f"-o{tempdir}", "-y", file], + universal_newlines=True, + stderr=subprocess.PIPE, + ) + valid_msi_filetypes = ["PE32", "text", "Microsoft Cabinet archive"] + for root, _, filenames in os.walk(tempdir): + for filename in filenames: + path = os.path.join(root, filename) + if any([x in File(path).get_type() for x in valid_msi_filetypes]): + os.rename(path, os.path.join(root, filename.split(".")[-1].strip("'").strip("!"))) + else: + path_delete(path) + extracted_files = collect_extracted_filenames(tempdir) + + ctx["extracted_files"] = extracted_files + + return ctx diff --git a/lib/cuckoo/common/integrations/utils.py b/lib/cuckoo/common/integrations/utils.py new file mode 100644 index 00000000000..b94bcc74e84 --- /dev/null +++ b/lib/cuckoo/common/integrations/utils.py @@ -0,0 +1,23 @@ +import functools +import signal +import subprocess +from typing import Union + +def pass_signal(proc, signum, frame): + proc.send_signal(signum) + + +def run_tool(*args, **kwargs) -> Union[bytes, str]: + """Start a subprocess to run the given tool. Make sure to pass a SIGTERM signal to + that process if it is received. + """ + kwargs["stdout"] = subprocess.PIPE + old_handler = None + try: + proc = subprocess.Popen(*args, **kwargs) + old_handler = signal.signal(signal.SIGTERM, functools.partial(pass_signal, proc)) + (stdout, stderr) = proc.communicate() + return stdout + finally: + if old_handler: + signal.signal(signal.SIGTERM, old_handler) diff --git a/lib/cuckoo/common/objects.py b/lib/cuckoo/common/objects.py index 7a5505232f5..9c689efb01b 100644 --- a/lib/cuckoo/common/objects.py +++ b/lib/cuckoo/common/objects.py @@ -175,6 +175,7 @@ class File: # caching 'em. This dictionary is filled during init_yara(). # ToDo find a way to get compiled YARA hash so we can loopup files if hash is the same yara_rules = {} + yara_rules_hash = None yara_initialized = False # static fields which indicate whether the user has been # notified about missing dependencies already @@ -444,6 +445,26 @@ def init_yara(self, raise_exception: bool = False): # Generate root directory for yara rules. 
yara_root = os.path.join(CUCKOO_ROOT, "data", "yara") custom_yara_root = os.path.join(CUCKOO_ROOT, "custom", "yara") + + # Collect all rule files for hashing to ensure determinism + all_rule_files = [] + for category in categories: + for path in (yara_root, custom_yara_root): + category_root = os.path.join(path, category) + if not path_exists(category_root): + continue + for root, _, filenames in os.walk(category_root, followlinks=True): + if root.endswith("deprecated"): + continue + for filename in filenames: + if filename.endswith((".yar", ".yara")): + all_rule_files.append(os.path.join(root, filename)) + + hasher = hashlib.sha256() + for filepath in sorted(all_rule_files): + hasher.update(Path(filepath).read_bytes()) + File.yara_rules_hash = hasher.hexdigest() + # Loop through all categories. for category in categories: rules, indexed = {}, [] @@ -457,7 +478,7 @@ def init_yara(self, raise_exception: bool = False): for category_root, _, filenames in os.walk(category_root, followlinks=True): if category_root.endswith("deprecated"): continue - for filename in filenames: + for filename in sorted(filenames): if not filename.endswith((".yar", ".yara")): continue filepath = os.path.join(category_root, filename) @@ -540,6 +561,7 @@ def init_yara(self, raise_exception: bool = False): log.debug("\t `-- %s %s", category, entry) else: log.debug("\t |-- %s %s", category, entry) + File.yara_rules_hash = hasher.hexdigest() def get_yara(self, category="binaries", externals=None): """Get Yara signatures matches. diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index cc4671eed89..fd7733077a9 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -13,6 +13,7 @@ # along with this program. If not, see . import collections +import hashlib import json import logging import os @@ -31,10 +32,12 @@ add_family_detection, convert_to_printable_and_truncate, get_clamav_consensus, + get_options, make_bytes, texttypes, wide2str, ) +from dev_utils.mongodb import mongo_find_one processing_conf = Config("processing") integrations_conf = Config("integrations") @@ -181,7 +184,52 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, else: duplicated["sha256"].add(sha256) - file_info, pefile_object = f.get_all() + cached = False + pefile_object = None + run_static = True + + # Calculate options hash to prevent poisoning + opts = get_options(self.task.get("options", "")) + sorted_opts = json.dumps(opts, sort_keys=True) + options_hash = hashlib.sha256(sorted_opts.encode()).hexdigest() + + if processing_conf.CAPE.file_cache: + try: + db_file = mongo_find_one("files", {"sha256": sha256}) + if db_file: + # Security Fix: Update path immediately + db_file["path"] = file_path + if "_id" in db_file: + del db_file["_id"] + + yara_match = db_file.get("yara_hash", "") == File.yara_rules_hash + options_match = db_file.get("options_hash", "") == options_hash + + if yara_match and options_match: + file_info = db_file + cached = True + run_static = False + else: + # Partial hit + file_info = db_file + cached = True # We have the base object + run_static = True # But we need to re-run static/tools + + if not yara_match: + # Update YARA + file_info["yara"] = f.get_yara() + file_info["cape_yara"] = f.get_yara(category="CAPE") + file_info["yara_hash"] = File.yara_rules_hash + + except Exception as e: + log.exception(e) + + if not cached: + file_info, pefile_object = f.get_all() + file_info["yara_hash"] = File.yara_rules_hash + run_static = True + + file_info["options_hash"] 
= options_hash if category in ("static", "file"): file_info["name"] = Path(self.task["target"]).name @@ -195,16 +243,17 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, add_family_detection(self.results, clamav_detection, "ClamAV", file_info["sha256"]) # should we use dropped path here? - static_file_info( - file_info, - file_path, - str(self.task["id"]), - self.task.get("package", ""), - self.task.get("options", ""), - self.self_extracted, - self.results, - duplicated, - ) + if run_static: + static_file_info( + file_info, + file_path, + str(self.task["id"]), + self.task.get("package", ""), + self.task.get("options", ""), + self.self_extracted, + self.results, + duplicated, + ) type_string, append_file = self._metadata_processing(metadata, file_info, append_file) diff --git a/tests/test_file_extra_info.py b/tests/test_file_extra_info.py index c07f8b123ee..78f29dbf227 100644 --- a/tests/test_file_extra_info.py +++ b/tests/test_file_extra_info.py @@ -7,6 +7,7 @@ import pytest from lib.cuckoo.common.integrations import file_extra_info +from lib.cuckoo.common.integrations.file_extra_info_modules.msi_extract import extract_details as msi_extract @pytest.fixture(autouse=True) @@ -101,7 +102,7 @@ def test_de4dot_deobfuscate(self): reason="Required data file is not present", ) def test_msi_extract(self): - extracted_files = file_extra_info.msi_extract( + extracted_files = msi_extract( file=f"{self_extraction_dir}/0ea5e25b12ab314bc9a0569c3ca756f205f40b792119f8e0fc62c874628dfea0.msi", filetype="MSI Installer", **{"tests": True, "options": {}}, From 728e31a2745c34bd9744a4866782a576c5e0acd3 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:20:46 +0100 Subject: [PATCH 07/11] Update file_extra_info.py --- .../common/integrations/file_extra_info.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index e4c393c735c..df1c09dc579 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -168,16 +168,16 @@ def static_file_info( if ( not HAVE_OLETOOLS - and "Zip archive data, at least v2.0" in data_dictionary["type"] + and "Zip archive data, at least v2.0" in data_dictionary.get("type", "") and package in {"doc", "ppt", "xls", "pub"} ): log.info("Missed dependencies: pip3 install oletools") - if "MSI Installer" in data_dictionary["type"] and "msi" not in data_dictionary: + if "MSI Installer" in data_dictionary.get("type", "") and "msi" not in data_dictionary: data_dictionary["msi"] = parse_msi(file_path) # ToDo we need type checking as it wont work for most of static jobs - if HAVE_PEFILE and ("PE32" in data_dictionary["type"] or "MS-DOS executable" in data_dictionary["type"]): + if HAVE_PEFILE and ("PE32" in data_dictionary.get("type", "") or "MS-DOS executable" in data_dictionary.get("type", "")): if "pe" not in data_dictionary: with PortableExecutable(file_path) as pe: data_dictionary["pe"] = pe.run(task_id) @@ -188,12 +188,12 @@ def static_file_info( if capa_details: data_dictionary["flare_capa"] = capa_details - if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: + if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary.get("type", "") and "floss" not in data_dictionary: floss_strings = Floss(file_path, "static", "pe").run() if floss_strings: 
data_dictionary["floss"] = floss_strings - if "Mono" in data_dictionary["type"] and "dotnet" not in data_dictionary: + if "Mono" in data_dictionary.get("type", "") and "dotnet" not in data_dictionary: if integration_conf.general.dotnet: data_dictionary["dotnet"] = DotNETExecutable(file_path).run() if processing_conf.strings.dotnet and "dotnet_strings" not in data_dictionary: @@ -205,20 +205,20 @@ def static_file_info( if "office" not in data_dictionary: # options is dict where we need to get pass get_options data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run() - elif ("PDF" in data_dictionary["type"] or file_path.endswith(".pdf")) and integration_conf.general.pdf: + elif ("PDF" in data_dictionary.get("type", "") or file_path.endswith(".pdf")) and integration_conf.general.pdf: if "pdf" not in data_dictionary: data_dictionary["pdf"] = PDF(file_path).run() elif ( - package in {"wsf", "hta"} or data_dictionary["type"] == "XML document text" or file_path.endswith(".wsf") + package in {"wsf", "hta"} or data_dictionary.get("type", "") == "XML document text" or file_path.endswith(".wsf") ) and integration_conf.general.windows_script: if "wsf" not in data_dictionary: data_dictionary["wsf"] = WindowsScriptFile(file_path).run() # elif package in {"js", "vbs"}: # data_dictionary["js"] = EncodedScriptFile(file_path).run() - elif (package == "lnk" or "MS Windows shortcut" in data_dictionary["type"]) and integration_conf.general.lnk: + elif (package == "lnk" or "MS Windows shortcut" in data_dictionary.get("type", "")) and integration_conf.general.lnk: if "lnk" not in data_dictionary: data_dictionary["lnk"] = LnkShortcut(file_path).run() - elif ("Java Jar" in data_dictionary["type"] or file_path.endswith(".jar")) and integration_conf.general.java: + elif ("Java Jar" in data_dictionary.get("type", "") or file_path.endswith(".jar")) and integration_conf.general.java: if "java" not in data_dictionary: if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary): log.error("procyon_path specified in processing.conf but the file does not exist") @@ -229,7 +229,7 @@ def static_file_info( # It's possible to fool libmagic into thinking our 2007+ file is a zip. # So until we have static analysis for zip files, we can use oleid to fail us out silently, # yeilding no static analysis results for actual zip files. - # elif ("ELF" in data_dictionary["type"] or file_path.endswith(".elf")) and integration_conf.general.elf: + # elif ("ELF" in data_dictionary.get("type", "") or file_path.endswith(".elf")) and integration_conf.general.elf: # data_dictionary["elf"] = ELF(file_path).run() # data_dictionary["keys"] = f.get_keys() # elif HAVE_OLETOOLS and package == "hwp" and integration_conf.general.hwp: @@ -246,7 +246,7 @@ def static_file_info( if processing_conf.die.enabled and "die" not in data_dictionary: data_dictionary["die"] = detect_it_easy_info(file_path) - if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: + if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary.get("type", "") and "floss" not in data_dictionary: floss_strings = Floss(file_path, package).run() if floss_strings: data_dictionary["floss"] = floss_strings @@ -433,7 +433,7 @@ def generic_file_extractors( # Arguments that some extractors need. They will always get passed, so the # extractor functions need to accept `**_` and just discard them. 
kwargs = { - "filetype": data_dictionary["type"], + "filetype": data_dictionary.get("type", ""), "data_dictionary": data_dictionary, "options": options, "tests": tests, From 729b928aa261e4a1b1214e48fe73cf02170596aa Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:41:22 +0100 Subject: [PATCH 08/11] add missed type in file_cache --- dev_utils/mongo_hooks.py | 1 + modules/processing/CAPE.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/dev_utils/mongo_hooks.py b/dev_utils/mongo_hooks.py index 08427a7ec31..360b15ea451 100644 --- a/dev_utils/mongo_hooks.py +++ b/dev_utils/mongo_hooks.py @@ -61,6 +61,7 @@ def normalize_file(file_dict, task_id): "entrypoint", "data", "strings", + "type", ) new_dict = {} for fld in static_fields: diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index fd7733077a9..99954ed60dd 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -209,12 +209,19 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, file_info = db_file cached = True run_static = False + # Regenerate fields stripped by mongo_hooks + if "type" not in file_info: + file_info["type"] = f.get_type() else: # Partial hit file_info = db_file cached = True # We have the base object run_static = True # But we need to re-run static/tools + # Regenerate fields stripped by mongo_hooks + if "type" not in file_info: + file_info["type"] = f.get_type() + if not yara_match: # Update YARA file_info["yara"] = f.get_yara() From b4f67702c2c431ec2e7093e44699493fdd852996 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:43:07 +0100 Subject: [PATCH 09/11] Update file_extra_info.py --- lib/cuckoo/common/integrations/file_extra_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index df1c09dc579..1535190d3f5 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -224,7 +224,7 @@ def static_file_info( log.error("procyon_path specified in processing.conf but the file does not exist") else: data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run() - elif (file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp")) and "rdp" not in data_dictionary: + elif (file_path.endswith(".rdp") or data_dictionary.get("name", "").endswith(".rdp")) and "rdp" not in data_dictionary: data_dictionary["rdp"] = parse_rdp_file(file_path) # It's possible to fool libmagic into thinking our 2007+ file is a zip. # So until we have static analysis for zip files, we can use oleid to fail us out silently, From 0da43269ff7fd6b07440db6aabeaa8c0eb98fa40 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:57:30 +0100 Subject: [PATCH 10/11] Add extra file metadata fields to CAPE/mongo hooks Add additional static fields (yara, cape_yara, yara_hash, options_hash, clamav) to mongo normalization so they aren't stripped. In CAPE processing, ensure the internal pe object is populated (f.get_type(); pefile_object = f.pe) for full and partial hits, and fill missing options_hash and yara_hash for partial results. Also ensure file name and guest_paths are set when absent. These changes restore metadata removed by mongo_hooks and ensure pefile and hash fields are available for downstream results. 
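The backfill is needed because dev_utils/mongo_hooks.py normalizes shared file records down to a whitelist of static fields; conceptually it behaves like the simplified sketch below (not the full hook, which also builds the new dict field by field and handles task references):

    def normalize_file(file_dict, task_id):
        # Only whitelisted static fields survive into the shared "files"
        # collection, so anything outside this tuple has to be regenerated
        # when a cached record is reused.
        static_fields = ("entrypoint", "data", "strings", "type",
                         "yara", "cape_yara", "yara_hash", "options_hash", "clamav")
        return {fld: file_dict[fld] for fld in static_fields if fld in file_dict}
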
--- conf/default/processing.conf.default | 2 ++ dev_utils/mongo_hooks.py | 5 +++++ modules/processing/CAPE.py | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/conf/default/processing.conf.default b/conf/default/processing.conf.default index 8cc2ef115a5..4d28e0fdd81 100644 --- a/conf/default/processing.conf.default +++ b/conf/default/processing.conf.default @@ -230,6 +230,8 @@ userdb_signature = no replace_patterns = no # Use file cache to speed up processing by looking up already processed files in MongoDB file_cache = no +# Store pefile objects for later usage? useful if you doing something in signatures/reporting +pefile_store = no # Deduplicate screenshots - You need to install dependency ImageHash>=4.3.1 [deduplication] diff --git a/dev_utils/mongo_hooks.py b/dev_utils/mongo_hooks.py index 360b15ea451..4f280ab820d 100644 --- a/dev_utils/mongo_hooks.py +++ b/dev_utils/mongo_hooks.py @@ -62,6 +62,11 @@ def normalize_file(file_dict, task_id): "data", "strings", "type", + "yara", + "cape_yara", + "yara_hash", + "options_hash", + "clamav", ) new_dict = {} for fld in static_fields: diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index 99954ed60dd..cbd6e2dd99e 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -212,6 +212,11 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, # Regenerate fields stripped by mongo_hooks if "type" not in file_info: file_info["type"] = f.get_type() + + if processing_conf.CAPE.pefile_store: + # Populate internal pe object for self.results["pefiles"] + f.get_type() + pefile_object = f.pe else: # Partial hit file_info = db_file @@ -222,6 +227,17 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, if "type" not in file_info: file_info["type"] = f.get_type() + if processing_conf.CAPE.pefile_store: + # Populate internal pe object for self.results["pefiles"] + f.get_type() + pefile_object = f.pe + + if "options_hash" not in file_info: + file_info["options_hash"] = options_hash + + if "yara_hash" not in file_info: + file_info["yara_hash"] = File.yara_rules_hash + if not yara_match: # Update YARA file_info["yara"] = f.get_yara() @@ -236,6 +252,11 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, file_info["yara_hash"] = File.yara_rules_hash run_static = True + if "name" not in file_info: + file_info["name"] = f.get_name() + if "guest_paths" not in file_info: + file_info["guest_paths"] = f.guest_paths + file_info["options_hash"] = options_hash if category in ("static", "file"): From 410fa9a9ead47ab5720d64c768a2ac51f9f4668e Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 15:06:17 +0100 Subject: [PATCH 11/11] set reprocess variable --- lib/cuckoo/core/plugins.py | 1 + modules/reporting/gcs.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/lib/cuckoo/core/plugins.py b/lib/cuckoo/core/plugins.py index 49d6c59eae5..7998a75b027 100644 --- a/lib/cuckoo/core/plugins.py +++ b/lib/cuckoo/core/plugins.py @@ -861,6 +861,7 @@ def process(self, module): current.set_options(options) # Load the content of the analysis.conf file. 
current.cfg = AnalysisConfig(current.conf_path) + current.reprocess = self.reprocess try: log.debug('Executing reporting module "%s"', current.__class__.__name__) diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index 6f88b279cc9..1af389d0a12 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -149,6 +149,9 @@ def run(self, results): ) return + if self.reprocess: + return + tlp = results.get("info", {}).get("tlp") analysis_id = results.get("info", {}).get("id")
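

With current.reprocess wired in by this last patch, any reporting module with external side effects can guard itself the same way gcs.py now does; a minimal sketch (the module name and upload method are placeholders, only the reprocess attribute comes from this patch):

    from lib.cuckoo.common.abstracts import Report

    class MyUploader(Report):
        def run(self, results):
            # Set by the reporting runner for every module as of this patch;
            # default to False in case the module is invoked standalone.
            if getattr(self, "reprocess", False):
                return                        # the report was already shipped once
            self.upload(results)              # placeholder for the real side effect

        def upload(self, results):
            raise NotImplementedError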