From b0a23683bb0d02720eaab21f466ac37ca6e8050e Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:30:48 +0100 Subject: [PATCH 01/11] Update network_utils.py --- lib/cuckoo/common/network_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/cuckoo/common/network_utils.py b/lib/cuckoo/common/network_utils.py index 86302687f28..5c450873ea1 100644 --- a/lib/cuckoo/common/network_utils.py +++ b/lib/cuckoo/common/network_utils.py @@ -31,6 +31,7 @@ "internetconnectw", "winhttpopenrequest", "winhttpsendrequest", + "winhttpgetproxyforurl", "winhttpconnect", "winhttpopen", "internetopenurla", From 06dd1e6c684965f70febb96f8468f811c24c1c93 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:40:18 +0100 Subject: [PATCH 02/11] Update network_utils.py --- lib/cuckoo/common/network_utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/cuckoo/common/network_utils.py b/lib/cuckoo/common/network_utils.py index 5c450873ea1..cdad4ca53f2 100644 --- a/lib/cuckoo/common/network_utils.py +++ b/lib/cuckoo/common/network_utils.py @@ -15,6 +15,7 @@ "gethostbynamew", "dnsquery_a", "dnsquery_w", + "dnsquery_utf8", "dnsqueryex", "dnsquery", } @@ -25,6 +26,8 @@ "internetcrackurlw", "httpsendrequesta", "httpsendrequestw", + "httpsendrequestexa", + "httpsendrequestexw", "internetsendrequesta", "internetsendrequestw", "internetconnecta", @@ -38,6 +41,14 @@ "internetopenurlw", "httpopenrequesta", "httpopenrequestw", + "urldownloadtofilew", + "urldownloadtocachefilew", + "cryptretrieveobjectbyurlw", + "urlcanonicalizew", + "mkparsedisplayname", + "mkparsedisplaynameex", + "dsenumeratedomaintrustsw", + "wnetuseconnectionw", "isvalidurl", } @@ -96,6 +107,8 @@ def _extract_domain_from_call(call, args_map): "QueryName", "lpstrName", "pName", + "ServerName", + "servername", ): v = args_map.get(name) if isinstance(v, str) and v.strip(): From e22b5f457f800d6ce3f625da09cbfb7a9663896b Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:47:46 +0100 Subject: [PATCH 03/11] add http(s) from behavior --- modules/processing/behavior.py | 13 ++++++ modules/processing/network.py | 76 +++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/modules/processing/behavior.py b/modules/processing/behavior.py index 942fd2bced5..bbf08586c29 100644 --- a/modules/processing/behavior.py +++ b/modules/processing/behavior.py @@ -1232,6 +1232,7 @@ class NetworkMap: def __init__(self): self.endpoint_map = defaultdict(list) # (ip, port) -> [pinfo] self.http_host_map = defaultdict(list) # host -> [pinfo] + self.http_requests = [] # url -> [pinfo] self.dns_intents = defaultdict(list) # domain -> [intent] def event_apicall(self, call, process): @@ -1277,6 +1278,17 @@ def event_apicall(self, call, process): if host: _add_http_host(self.http_host_map, host, pinfo, sock=sock) + if u: + self.http_requests.append( + { + "url": u, + "host": host, + "process_id": process.get("process_id"), + "process_name": process.get("process_name"), + "time": _parse_behavior_ts(call.get("timestamp")), + } + ) + if isinstance(buf, str): u2 = _extract_first_url(buf) if u2: @@ -1332,6 +1344,7 @@ def run(self): "endpoint_map": endpoint_map_str, "http_host_map": self.http_host_map, "dns_intents": self.dns_intents, + "http_requests": self.http_requests, } diff --git a/modules/processing/network.py b/modules/processing/network.py index f402a77148e..2bb3b16f425 100644 --- a/modules/processing/network.py +++ b/modules/processing/network.py @@ -23,7 +23,7 @@ from 
itertools import islice from json import loads from typing import Any, Dict, List, Optional -from urllib.parse import urlunparse +from urllib.parse import urlparse, urlunparse import cachetools.func import dns.resolver @@ -1365,11 +1365,77 @@ def _merge_behavior_network(self, results): # 2. HTTP http_host_map = net_map.get("http_host_map", {}) - existing_hosts = {h.get("host") for h in network.get("http", [])} - http_events = (network.get("http", []) or []) + (network.get("http_ex", []) or []) + (network.get("https_ex", []) or []) - existing_hosts = {_norm_domain(h.get("host")) for h in http_events if h.get("host")} + http_requests = net_map.get("http_requests", []) + + existing_hosts = set() + existing_urls = set() + for h in (network.get("http", []) or []) + (network.get("http_ex", []) or []) + (network.get("https_ex", []) or []): + host = h.get("host") + if host: + existing_hosts.add(_norm_domain(host)) + uri = h.get("uri", "/") + # Store simplistic URL representation for deduplication + existing_urls.add(f"{host}{uri}") + + # Process full requests from behavior + for req in http_requests: + url = req.get("url") + if not url: + continue + + # Parse URL to components + try: + parsed = urlparse(url) + if not parsed.netloc and not parsed.path: + continue + + host = parsed.netloc or req.get("host") + # Handle cases where URL might be just a domain or path + if not host and url and "." in url and "/" not in url: + host = url + + # Fallback host normalization + if not host and req.get("host"): + host = req.get("host") + + uri = parsed.path + if parsed.query: + uri += f"?{parsed.query}" + if not uri: + uri = "/" + + # Check for duplicates + url_key = f"{host}{uri}" + if url_key in existing_urls: + continue + + port = 80 + if parsed.port: + port = parsed.port + elif parsed.scheme == "https": + port = 443 + + entry = { + "host": host, + "port": port, + "uri": uri, + "method": "GET", + "source": "behavior", + "process_id": req.get("process_id"), + "process_name": req.get("process_name"), + "time": req.get("time"), + } + network.setdefault("http", []).append(entry) + if host: + existing_hosts.add(_norm_domain(host)) + existing_urls.add(url_key) + + except Exception: + log.warning("Failed to parse behavior URL: %s", url) + + # Process host-only map for remaining missing hosts for host, procs in http_host_map.items(): - if host not in existing_hosts: + if _norm_domain(host) not in existing_hosts: proc = procs[0] if procs else {} entry = { "host": host, From 18f5661acf6b8ef2979db74fd7d1cec7e1ca2ec1 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:54:16 +0100 Subject: [PATCH 04/11] Update network.py --- modules/processing/network.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/processing/network.py b/modules/processing/network.py index 2bb3b16f425..de175ec5010 100644 --- a/modules/processing/network.py +++ b/modules/processing/network.py @@ -1342,7 +1342,7 @@ def _merge_behavior_network(self, results): if not net_map: return - network = results.get("network", {}) + network = results # 1. 
DNS dns_intents = net_map.get("dns_intents", {}) @@ -1359,7 +1359,7 @@ def _merge_behavior_network(self, results): "source": "behavior", "process_id": proc.get("process_id"), "process_name": proc.get("process_name"), - "time": first_intent.get("ts_epoch"), + "first_seen": first_intent.get("ts_epoch"), } network.setdefault("dns", []).append(entry) @@ -1398,14 +1398,14 @@ def _merge_behavior_network(self, results): if not host and req.get("host"): host = req.get("host") - uri = parsed.path + path = parsed.path if parsed.query: - uri += f"?{parsed.query}" - if not uri: - uri = "/" + path += f"?{parsed.query}" + if not path: + path = "/" # Check for duplicates - url_key = f"{host}{uri}" + url_key = f"{host}{path}" if url_key in existing_urls: continue @@ -1418,12 +1418,13 @@ def _merge_behavior_network(self, results): entry = { "host": host, "port": port, - "uri": uri, + "uri": url, + "path": path, "method": "GET", "source": "behavior", "process_id": req.get("process_id"), "process_name": req.get("process_name"), - "time": req.get("time"), + "first_seen": req.get("time"), } network.setdefault("http", []).append(entry) if host: @@ -1440,7 +1441,8 @@ def _merge_behavior_network(self, results): entry = { "host": host, "port": 80, - "uri": "/", + "uri": f"http://{host}/", + "path": "/", "method": "GET", "source": "behavior", "process_id": proc.get("process_id"), From fda8d198d9fda0ea00c60241278510bfbb07eb17 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 12:59:54 +0100 Subject: [PATCH 05/11] Update admin_conf.py_example --- admin/admin_conf.py_example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/admin/admin_conf.py_example b/admin/admin_conf.py_example index 50e990fae5a..eccd5698e06 100644 --- a/admin/admin_conf.py_example +++ b/admin/admin_conf.py_example @@ -23,7 +23,7 @@ JUMP_BOX_SECOND_USERNAME = "" JUMP_BOX_SECOND_PORT = 22 NUM_THREADS = 5 -POSTPROCESS = "systemctl restart cape-processor; systemctl status cape-processor" +POSTPROCESS = "systemctl restart cape-processor; systemctl status --no-pager cape-processor" EXCLUDE_DIRS = set( [ From 7a7ce83965bcc8a4d437b217e17abbfe58a7bd45 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:06:12 +0100 Subject: [PATCH 06/11] file details cache (#2894) * file details cache Summary of Changes: 1. Modified `lib/cuckoo/common/objects.py`: * Updated File.init_yara to calculate a SHA256 hash of all compiled YARA rule files. * Stored this hash in File.yara_rules_hash. 2. Modified `modules/processing/CAPE.py`: * In process_file, imported mongo_find_one from dev_utils.mongodb. * Implemented logic to query the MongoDB files collection using the file's SHA256. * Cache Hit: If the file is found and yara_hash matches, file_info is populated from the database, skipping expensive operations like f.get_all() (PE parsing, initial YARA scan) and static_file_info. * Partial Cache Hit: If the file is found but yara_hash differs, the cached data is loaded, but YARA scans are re-run (f.get_yara()), and the yara_hash field is updated. static_file_info is still skipped to avoid re-extracting/re-analyzing static properties. * Cache Miss: If the file is not in the DB, standard processing proceeds, and yara_hash is added to file_info for future caching. This solution optimizes processing time for previously analyzed files while ensuring YARA scan results remain up-to-date when rules change. 
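For reviewers, the lookup flow described above reduces to roughly the following (a condensed sketch of the logic added to modules/processing/CAPE.py in this patch, not the verbatim code; the helper name lookup_cached_file_info is invented here for illustration):

    from dev_utils.mongodb import mongo_find_one
    from lib.cuckoo.common.objects import File

    def lookup_cached_file_info(f, sha256, file_path):
        """Return cached file_info from MongoDB, or None on a cache miss.
        On a partial hit (rule set changed) only the YARA results are redone."""
        db_file = mongo_find_one("files", {"sha256": sha256})
        if not db_file:
            return None                          # cache miss: full processing path
        db_file.pop("_id", None)                 # drop Mongo's internal id
        db_file["path"] = file_path              # always point at the current on-disk path
        if db_file.get("yara_hash") != File.yara_rules_hash:
            # partial hit: the compiled rule set changed since this file was stored
            db_file["yara"] = f.get_yara()
            db_file["cape_yara"] = f.get_yara(category="CAPE")
            db_file["yara_hash"] = File.yara_rules_hash
        return db_file
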
* more * fix * Update CAPE.py * Update CAPE.py * Update test_file_extra_info.py * fixes * Update objects.py * make it on/off --- conf/default/processing.conf.default | 2 + .../common/integrations/file_extra_info.py | 161 ++++++------------ .../file_extra_info_modules/msi_extract.py | 73 ++++++++ lib/cuckoo/common/integrations/utils.py | 23 +++ lib/cuckoo/common/objects.py | 24 ++- modules/processing/CAPE.py | 71 ++++++-- tests/test_file_extra_info.py | 3 +- 7 files changed, 237 insertions(+), 120 deletions(-) create mode 100644 lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py create mode 100644 lib/cuckoo/common/integrations/utils.py diff --git a/conf/default/processing.conf.default b/conf/default/processing.conf.default index ca0c448bca6..8cc2ef115a5 100644 --- a/conf/default/processing.conf.default +++ b/conf/default/processing.conf.default @@ -228,6 +228,8 @@ max_file_size = 90 userdb_signature = no # https://capev2.readthedocs.io/en/latest/usage/patterns_replacement.html replace_patterns = no +# Use file cache to speed up processing by looking up already processed files in MongoDB +file_cache = no # Deduplicate screenshots - You need to install dependency ImageHash>=4.3.1 [deduplication] diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index 31472887fd0..e4c393c735c 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -1,5 +1,4 @@ import concurrent.futures -import functools import hashlib import json import logging @@ -7,11 +6,10 @@ import re import shlex import shutil -import signal import subprocess # from contextlib import suppress -from typing import Any, DefaultDict, List, Optional, Set, Union +from typing import Any, DefaultDict, List, Optional, Set import pebble @@ -38,7 +36,6 @@ from lib.cuckoo.common.load_extra_modules import file_extra_info_load_modules from lib.cuckoo.common.objects import File from lib.cuckoo.common.path_utils import ( - path_delete, path_exists, path_get_size, path_is_file, @@ -176,52 +173,58 @@ def static_file_info( ): log.info("Missed dependencies: pip3 install oletools") - if "MSI Installer" in data_dictionary["type"]: + if "MSI Installer" in data_dictionary["type"] and "msi" not in data_dictionary: data_dictionary["msi"] = parse_msi(file_path) # ToDo we need type checking as it wont work for most of static jobs if HAVE_PEFILE and ("PE32" in data_dictionary["type"] or "MS-DOS executable" in data_dictionary["type"]): - with PortableExecutable(file_path) as pe: - data_dictionary["pe"] = pe.run(task_id) + if "pe" not in data_dictionary: + with PortableExecutable(file_path) as pe: + data_dictionary["pe"] = pe.run(task_id) - if HAVE_FLARE_CAPA: + if HAVE_FLARE_CAPA and "flare_capa" not in data_dictionary: # https://github.com/mandiant/capa/issues/2620 capa_details = flare_capa_details(file_path, "static") if capa_details: data_dictionary["flare_capa"] = capa_details - if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"]: + if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: floss_strings = Floss(file_path, "static", "pe").run() if floss_strings: data_dictionary["floss"] = floss_strings - if "Mono" in data_dictionary["type"]: + if "Mono" in data_dictionary["type"] and "dotnet" not in data_dictionary: if integration_conf.general.dotnet: data_dictionary["dotnet"] = 
DotNETExecutable(file_path).run() - if processing_conf.strings.dotnet: + if processing_conf.strings.dotnet and "dotnet_strings" not in data_dictionary: dotnet_strings = dotnet_user_strings(file_path) if dotnet_strings: data_dictionary.setdefault("dotnet_strings", dotnet_strings) elif (HAVE_OLETOOLS and package in {"doc", "ppt", "xls", "pub"} and integration_conf.general.office) or data_dictionary.get("name", "").endswith((".doc", ".ppt", ".xls", ".pub")): - # options is dict where we need to get pass get_options - data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run() + if "office" not in data_dictionary: + # options is dict where we need to get pass get_options + data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run() elif ("PDF" in data_dictionary["type"] or file_path.endswith(".pdf")) and integration_conf.general.pdf: - data_dictionary["pdf"] = PDF(file_path).run() + if "pdf" not in data_dictionary: + data_dictionary["pdf"] = PDF(file_path).run() elif ( package in {"wsf", "hta"} or data_dictionary["type"] == "XML document text" or file_path.endswith(".wsf") ) and integration_conf.general.windows_script: - data_dictionary["wsf"] = WindowsScriptFile(file_path).run() + if "wsf" not in data_dictionary: + data_dictionary["wsf"] = WindowsScriptFile(file_path).run() # elif package in {"js", "vbs"}: # data_dictionary["js"] = EncodedScriptFile(file_path).run() elif (package == "lnk" or "MS Windows shortcut" in data_dictionary["type"]) and integration_conf.general.lnk: - data_dictionary["lnk"] = LnkShortcut(file_path).run() + if "lnk" not in data_dictionary: + data_dictionary["lnk"] = LnkShortcut(file_path).run() elif ("Java Jar" in data_dictionary["type"] or file_path.endswith(".jar")) and integration_conf.general.java: - if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary): - log.error("procyon_path specified in processing.conf but the file does not exist") - else: - data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run() - elif file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp"): + if "java" not in data_dictionary: + if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary): + log.error("procyon_path specified in processing.conf but the file does not exist") + else: + data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run() + elif (file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp")) and "rdp" not in data_dictionary: data_dictionary["rdp"] = parse_rdp_file(file_path) # It's possible to fool libmagic into thinking our 2007+ file is a zip. 
# So until we have static analysis for zip files, we can use oleid to fail us out silently, @@ -237,13 +240,13 @@ def static_file_info( if not file_path.startswith(exclude_startswith) and not file_path.endswith(excluded_extensions): data_dictionary["data"] = is_text_file(data_dictionary, file_path, processing_conf.CAPE.buffer, data) - if processing_conf.trid.enabled: + if processing_conf.trid.enabled and "trid" not in data_dictionary: data_dictionary["trid"] = trid_info(file_path) - if processing_conf.die.enabled: + if processing_conf.die.enabled and "die" not in data_dictionary: data_dictionary["die"] = detect_it_easy_info(file_path) - if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"]: + if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: floss_strings = Floss(file_path, package).run() if floss_strings: data_dictionary["floss"] = floss_strings @@ -253,7 +256,7 @@ def static_file_info( # think that we want to look them up on-demand (i.e. display the # "strings" button linking to an on_demand URL). data_dictionary["strings"] = [] - elif HAVE_STRINGS: + elif HAVE_STRINGS and "strings" not in data_dictionary: strings = extract_strings(file_path, dedup=True) data_dictionary["strings"] = strings else: @@ -262,7 +265,7 @@ def static_file_info( pass # ToDo we need url support - if HAVE_VIRUSTOTAL and processing_conf.virustotal.enabled: + if HAVE_VIRUSTOTAL and processing_conf.virustotal.enabled and "virustotal" not in data_dictionary: vt_details = vt_lookup("file", file_path, results) if vt_details: data_dictionary["virustotal"] = vt_details @@ -401,25 +404,7 @@ def _extracted_files_metadata( return metadata -def pass_signal(proc, signum, frame): - proc.send_signal(signum) - - -def run_tool(*args, **kwargs) -> Union[bytes, str]: - """Start a subprocess to run the given tool. Make sure to pass a SIGTERM signal to - that process if it is received. - """ - kwargs["stdout"] = subprocess.PIPE - old_handler = None - try: - proc = subprocess.Popen(*args, **kwargs) - old_handler = signal.signal(signal.SIGTERM, functools.partial(pass_signal, proc)) - (stdout, stderr) = proc.communicate() - return stdout - finally: - if old_handler: - signal.signal(signal.SIGTERM, old_handler) - +from lib.cuckoo.common.integrations.utils import run_tool def generic_file_extractors( file: str, @@ -454,24 +439,8 @@ def generic_file_extractors( "tests": tests, } - file_info_funcs = [ - msi_extract, - kixtart_extract, - vbe_extract, - batch_extract, - UnAutoIt_extract, - UPX_unpack, - RarSFX_extract, - Inno_extract, - SevenZip_unpack, - de4dot_deobfuscate, - eziriz_deobfuscate, - office_one, - msix_extract, - UnGPG_extract, - ] - futures = {} + executed_tools = data_dictionary.setdefault("executed_tools", []) with pebble.ProcessPool(max_workers=int(integration_conf.general.max_workers)) as pool: # Prefer custom modules over the built-in ones, since only 1 is allowed # to be the extracted_files_tool. 
@@ -479,6 +448,9 @@ def generic_file_extractors( for module in extra_info_modules: func_timeout = int(getattr(module, "timeout", 60)) funcname = module.__name__.split(".")[-1] + if funcname in executed_tools: + continue + executed_tools.append(funcname) futures[funcname] = pool.schedule(module.extract_details, args=args, kwargs=kwargs, timeout=func_timeout) for extraction_func in file_info_funcs: @@ -489,6 +461,10 @@ def generic_file_extractors( ): continue + if funcname in executed_tools: + continue + executed_tools.append(funcname) + func_timeout = int(getattr(integration_conf, funcname, {}).get("timeout", 60)) futures[funcname] = pool.schedule(extraction_func, args=args, kwargs=kwargs, timeout=func_timeout) pool.join() @@ -677,51 +653,6 @@ def de4dot_deobfuscate(file: str, *, filetype: str, **_) -> ExtractorReturnType: return ctx -@time_tracker -def msi_extract(file: str, *, filetype: str, **kwargs) -> ExtractorReturnType: - """Work on MSI Installers""" - - if "MSI Installer" not in filetype: - return - - # ToDo replace MsiExtract with pymsi - extracted_files = [] - # sudo apt install msitools - with extractor_ctx(file, "MsiExtract", prefix="msidump_", folder=tools_folder) as ctx: - tempdir = ctx["tempdir"] - output = False - if not kwargs.get("tests"): - # msiextract in different way that 7z, we need to add subfolder support - output = run_tool( - [integration_conf.msi_extract.binary, file, "--directory", tempdir], - universal_newlines=True, - stderr=subprocess.PIPE, - ) - if output: - extracted_files = [ - extracted_file - for extracted_file in list(filter(None, output.split("\n"))) - if path_is_file(os.path.join(tempdir, extracted_file)) - ] - else: - output = run_tool( - [sevenzip_binary, "e", f"-o{tempdir}", "-y", file], - universal_newlines=True, - stderr=subprocess.PIPE, - ) - valid_msi_filetypes = ["PE32", "text", "Microsoft Cabinet archive"] - for root, _, filenames in os.walk(tempdir): - for filename in filenames: - path = os.path.join(root, filename) - if any([x in File(path).get_type() for x in valid_msi_filetypes]): - os.rename(path, os.path.join(root, filename.split(".")[-1].strip("'").strip("!"))) - else: - path_delete(path) - extracted_files = collect_extracted_filenames(tempdir) - - ctx["extracted_files"] = extracted_files - - return ctx @time_tracker @@ -1021,3 +952,19 @@ def UnGPG_extract(file: str, filetype: str, data_dictionary: dict, options: dict ctx["extracted_files"] = collect_extracted_filenames(tempdir) return ctx + +file_info_funcs = [ + kixtart_extract, + vbe_extract, + batch_extract, + UnAutoIt_extract, + UPX_unpack, + RarSFX_extract, + Inno_extract, + SevenZip_unpack, + de4dot_deobfuscate, + eziriz_deobfuscate, + office_one, + msix_extract, + UnGPG_extract, +] diff --git a/lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py b/lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py new file mode 100644 index 00000000000..7fd9e43497f --- /dev/null +++ b/lib/cuckoo/common/integrations/file_extra_info_modules/msi_extract.py @@ -0,0 +1,73 @@ +import logging +import os +import subprocess + +from lib.cuckoo.common.config import Config +from lib.cuckoo.common.constants import CUCKOO_ROOT +from lib.cuckoo.common.integrations.file_extra_info_modules import ( + ExtractorReturnType, + collect_extracted_filenames, + extractor_ctx, + time_tracker, +) +from lib.cuckoo.common.objects import File +from lib.cuckoo.common.path_utils import path_delete, path_exists, path_is_file +from lib.cuckoo.common.integrations.utils import run_tool + +log 
= logging.getLogger(__name__) +integration_conf = Config("integrations") + +sevenzip_binary = os.path.join(CUCKOO_ROOT, "data/7zz") +if integration_conf.SevenZip_unpack.binary: + tmp_sevenzip_binary = os.path.join(CUCKOO_ROOT, integration_conf.SevenZip_unpack.binary) + if path_exists(tmp_sevenzip_binary): + sevenzip_binary = tmp_sevenzip_binary +# fallback +if not path_exists(sevenzip_binary): + sevenzip_binary = "/usr/bin/7z" + +@time_tracker +def extract_details(file: str, *, filetype: str, **kwargs) -> ExtractorReturnType: + """Work on MSI Installers""" + + if "MSI Installer" not in filetype: + return + + # ToDo replace MsiExtract with pymsi + extracted_files = [] + # sudo apt install msitools + with extractor_ctx(file, "MsiExtract", prefix="msidump_") as ctx: + tempdir = ctx["tempdir"] + output = False + if not kwargs.get("tests"): + # msiextract in different way that 7z, we need to add subfolder support + output = run_tool( + [integration_conf.msi_extract.binary, file, "--directory", tempdir], + universal_newlines=True, + stderr=subprocess.PIPE, + ) + if output: + extracted_files = [ + extracted_file + for extracted_file in list(filter(None, output.split("\n"))) + if path_is_file(os.path.join(tempdir, extracted_file)) + ] + else: + output = run_tool( + [sevenzip_binary, "e", f"-o{tempdir}", "-y", file], + universal_newlines=True, + stderr=subprocess.PIPE, + ) + valid_msi_filetypes = ["PE32", "text", "Microsoft Cabinet archive"] + for root, _, filenames in os.walk(tempdir): + for filename in filenames: + path = os.path.join(root, filename) + if any([x in File(path).get_type() for x in valid_msi_filetypes]): + os.rename(path, os.path.join(root, filename.split(".")[-1].strip("'").strip("!"))) + else: + path_delete(path) + extracted_files = collect_extracted_filenames(tempdir) + + ctx["extracted_files"] = extracted_files + + return ctx diff --git a/lib/cuckoo/common/integrations/utils.py b/lib/cuckoo/common/integrations/utils.py new file mode 100644 index 00000000000..b94bcc74e84 --- /dev/null +++ b/lib/cuckoo/common/integrations/utils.py @@ -0,0 +1,23 @@ +import functools +import signal +import subprocess +from typing import Union + +def pass_signal(proc, signum, frame): + proc.send_signal(signum) + + +def run_tool(*args, **kwargs) -> Union[bytes, str]: + """Start a subprocess to run the given tool. Make sure to pass a SIGTERM signal to + that process if it is received. + """ + kwargs["stdout"] = subprocess.PIPE + old_handler = None + try: + proc = subprocess.Popen(*args, **kwargs) + old_handler = signal.signal(signal.SIGTERM, functools.partial(pass_signal, proc)) + (stdout, stderr) = proc.communicate() + return stdout + finally: + if old_handler: + signal.signal(signal.SIGTERM, old_handler) diff --git a/lib/cuckoo/common/objects.py b/lib/cuckoo/common/objects.py index 7a5505232f5..9c689efb01b 100644 --- a/lib/cuckoo/common/objects.py +++ b/lib/cuckoo/common/objects.py @@ -175,6 +175,7 @@ class File: # caching 'em. This dictionary is filled during init_yara(). # ToDo find a way to get compiled YARA hash so we can loopup files if hash is the same yara_rules = {} + yara_rules_hash = None yara_initialized = False # static fields which indicate whether the user has been # notified about missing dependencies already @@ -444,6 +445,26 @@ def init_yara(self, raise_exception: bool = False): # Generate root directory for yara rules. 
yara_root = os.path.join(CUCKOO_ROOT, "data", "yara") custom_yara_root = os.path.join(CUCKOO_ROOT, "custom", "yara") + + # Collect all rule files for hashing to ensure determinism + all_rule_files = [] + for category in categories: + for path in (yara_root, custom_yara_root): + category_root = os.path.join(path, category) + if not path_exists(category_root): + continue + for root, _, filenames in os.walk(category_root, followlinks=True): + if root.endswith("deprecated"): + continue + for filename in filenames: + if filename.endswith((".yar", ".yara")): + all_rule_files.append(os.path.join(root, filename)) + + hasher = hashlib.sha256() + for filepath in sorted(all_rule_files): + hasher.update(Path(filepath).read_bytes()) + File.yara_rules_hash = hasher.hexdigest() + # Loop through all categories. for category in categories: rules, indexed = {}, [] @@ -457,7 +478,7 @@ def init_yara(self, raise_exception: bool = False): for category_root, _, filenames in os.walk(category_root, followlinks=True): if category_root.endswith("deprecated"): continue - for filename in filenames: + for filename in sorted(filenames): if not filename.endswith((".yar", ".yara")): continue filepath = os.path.join(category_root, filename) @@ -540,6 +561,7 @@ def init_yara(self, raise_exception: bool = False): log.debug("\t `-- %s %s", category, entry) else: log.debug("\t |-- %s %s", category, entry) + File.yara_rules_hash = hasher.hexdigest() def get_yara(self, category="binaries", externals=None): """Get Yara signatures matches. diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index cc4671eed89..fd7733077a9 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -13,6 +13,7 @@ # along with this program. If not, see . import collections +import hashlib import json import logging import os @@ -31,10 +32,12 @@ add_family_detection, convert_to_printable_and_truncate, get_clamav_consensus, + get_options, make_bytes, texttypes, wide2str, ) +from dev_utils.mongodb import mongo_find_one processing_conf = Config("processing") integrations_conf = Config("integrations") @@ -181,7 +184,52 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, else: duplicated["sha256"].add(sha256) - file_info, pefile_object = f.get_all() + cached = False + pefile_object = None + run_static = True + + # Calculate options hash to prevent poisoning + opts = get_options(self.task.get("options", "")) + sorted_opts = json.dumps(opts, sort_keys=True) + options_hash = hashlib.sha256(sorted_opts.encode()).hexdigest() + + if processing_conf.CAPE.file_cache: + try: + db_file = mongo_find_one("files", {"sha256": sha256}) + if db_file: + # Security Fix: Update path immediately + db_file["path"] = file_path + if "_id" in db_file: + del db_file["_id"] + + yara_match = db_file.get("yara_hash", "") == File.yara_rules_hash + options_match = db_file.get("options_hash", "") == options_hash + + if yara_match and options_match: + file_info = db_file + cached = True + run_static = False + else: + # Partial hit + file_info = db_file + cached = True # We have the base object + run_static = True # But we need to re-run static/tools + + if not yara_match: + # Update YARA + file_info["yara"] = f.get_yara() + file_info["cape_yara"] = f.get_yara(category="CAPE") + file_info["yara_hash"] = File.yara_rules_hash + + except Exception as e: + log.exception(e) + + if not cached: + file_info, pefile_object = f.get_all() + file_info["yara_hash"] = File.yara_rules_hash + run_static = True + + file_info["options_hash"] 
= options_hash if category in ("static", "file"): file_info["name"] = Path(self.task["target"]).name @@ -195,16 +243,17 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, add_family_detection(self.results, clamav_detection, "ClamAV", file_info["sha256"]) # should we use dropped path here? - static_file_info( - file_info, - file_path, - str(self.task["id"]), - self.task.get("package", ""), - self.task.get("options", ""), - self.self_extracted, - self.results, - duplicated, - ) + if run_static: + static_file_info( + file_info, + file_path, + str(self.task["id"]), + self.task.get("package", ""), + self.task.get("options", ""), + self.self_extracted, + self.results, + duplicated, + ) type_string, append_file = self._metadata_processing(metadata, file_info, append_file) diff --git a/tests/test_file_extra_info.py b/tests/test_file_extra_info.py index c07f8b123ee..78f29dbf227 100644 --- a/tests/test_file_extra_info.py +++ b/tests/test_file_extra_info.py @@ -7,6 +7,7 @@ import pytest from lib.cuckoo.common.integrations import file_extra_info +from lib.cuckoo.common.integrations.file_extra_info_modules.msi_extract import extract_details as msi_extract @pytest.fixture(autouse=True) @@ -101,7 +102,7 @@ def test_de4dot_deobfuscate(self): reason="Required data file is not present", ) def test_msi_extract(self): - extracted_files = file_extra_info.msi_extract( + extracted_files = msi_extract( file=f"{self_extraction_dir}/0ea5e25b12ab314bc9a0569c3ca756f205f40b792119f8e0fc62c874628dfea0.msi", filetype="MSI Installer", **{"tests": True, "options": {}}, From 728e31a2745c34bd9744a4866782a576c5e0acd3 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:20:46 +0100 Subject: [PATCH 07/11] Update file_extra_info.py --- .../common/integrations/file_extra_info.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index e4c393c735c..df1c09dc579 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -168,16 +168,16 @@ def static_file_info( if ( not HAVE_OLETOOLS - and "Zip archive data, at least v2.0" in data_dictionary["type"] + and "Zip archive data, at least v2.0" in data_dictionary.get("type", "") and package in {"doc", "ppt", "xls", "pub"} ): log.info("Missed dependencies: pip3 install oletools") - if "MSI Installer" in data_dictionary["type"] and "msi" not in data_dictionary: + if "MSI Installer" in data_dictionary.get("type", "") and "msi" not in data_dictionary: data_dictionary["msi"] = parse_msi(file_path) # ToDo we need type checking as it wont work for most of static jobs - if HAVE_PEFILE and ("PE32" in data_dictionary["type"] or "MS-DOS executable" in data_dictionary["type"]): + if HAVE_PEFILE and ("PE32" in data_dictionary.get("type", "") or "MS-DOS executable" in data_dictionary.get("type", "")): if "pe" not in data_dictionary: with PortableExecutable(file_path) as pe: data_dictionary["pe"] = pe.run(task_id) @@ -188,12 +188,12 @@ def static_file_info( if capa_details: data_dictionary["flare_capa"] = capa_details - if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: + if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary.get("type", "") and "floss" not in data_dictionary: floss_strings = Floss(file_path, "static", "pe").run() if floss_strings: 
data_dictionary["floss"] = floss_strings - if "Mono" in data_dictionary["type"] and "dotnet" not in data_dictionary: + if "Mono" in data_dictionary.get("type", "") and "dotnet" not in data_dictionary: if integration_conf.general.dotnet: data_dictionary["dotnet"] = DotNETExecutable(file_path).run() if processing_conf.strings.dotnet and "dotnet_strings" not in data_dictionary: @@ -205,20 +205,20 @@ def static_file_info( if "office" not in data_dictionary: # options is dict where we need to get pass get_options data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run() - elif ("PDF" in data_dictionary["type"] or file_path.endswith(".pdf")) and integration_conf.general.pdf: + elif ("PDF" in data_dictionary.get("type", "") or file_path.endswith(".pdf")) and integration_conf.general.pdf: if "pdf" not in data_dictionary: data_dictionary["pdf"] = PDF(file_path).run() elif ( - package in {"wsf", "hta"} or data_dictionary["type"] == "XML document text" or file_path.endswith(".wsf") + package in {"wsf", "hta"} or data_dictionary.get("type", "") == "XML document text" or file_path.endswith(".wsf") ) and integration_conf.general.windows_script: if "wsf" not in data_dictionary: data_dictionary["wsf"] = WindowsScriptFile(file_path).run() # elif package in {"js", "vbs"}: # data_dictionary["js"] = EncodedScriptFile(file_path).run() - elif (package == "lnk" or "MS Windows shortcut" in data_dictionary["type"]) and integration_conf.general.lnk: + elif (package == "lnk" or "MS Windows shortcut" in data_dictionary.get("type", "")) and integration_conf.general.lnk: if "lnk" not in data_dictionary: data_dictionary["lnk"] = LnkShortcut(file_path).run() - elif ("Java Jar" in data_dictionary["type"] or file_path.endswith(".jar")) and integration_conf.general.java: + elif ("Java Jar" in data_dictionary.get("type", "") or file_path.endswith(".jar")) and integration_conf.general.java: if "java" not in data_dictionary: if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary): log.error("procyon_path specified in processing.conf but the file does not exist") @@ -229,7 +229,7 @@ def static_file_info( # It's possible to fool libmagic into thinking our 2007+ file is a zip. # So until we have static analysis for zip files, we can use oleid to fail us out silently, # yeilding no static analysis results for actual zip files. - # elif ("ELF" in data_dictionary["type"] or file_path.endswith(".elf")) and integration_conf.general.elf: + # elif ("ELF" in data_dictionary.get("type", "") or file_path.endswith(".elf")) and integration_conf.general.elf: # data_dictionary["elf"] = ELF(file_path).run() # data_dictionary["keys"] = f.get_keys() # elif HAVE_OLETOOLS and package == "hwp" and integration_conf.general.hwp: @@ -246,7 +246,7 @@ def static_file_info( if processing_conf.die.enabled and "die" not in data_dictionary: data_dictionary["die"] = detect_it_easy_info(file_path) - if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"] and "floss" not in data_dictionary: + if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary.get("type", "") and "floss" not in data_dictionary: floss_strings = Floss(file_path, package).run() if floss_strings: data_dictionary["floss"] = floss_strings @@ -433,7 +433,7 @@ def generic_file_extractors( # Arguments that some extractors need. They will always get passed, so the # extractor functions need to accept `**_` and just discard them. 
kwargs = { - "filetype": data_dictionary["type"], + "filetype": data_dictionary.get("type", ""), "data_dictionary": data_dictionary, "options": options, "tests": tests, From 729b928aa261e4a1b1214e48fe73cf02170596aa Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:41:22 +0100 Subject: [PATCH 08/11] add missed type in file_cache --- dev_utils/mongo_hooks.py | 1 + modules/processing/CAPE.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/dev_utils/mongo_hooks.py b/dev_utils/mongo_hooks.py index 08427a7ec31..360b15ea451 100644 --- a/dev_utils/mongo_hooks.py +++ b/dev_utils/mongo_hooks.py @@ -61,6 +61,7 @@ def normalize_file(file_dict, task_id): "entrypoint", "data", "strings", + "type", ) new_dict = {} for fld in static_fields: diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index fd7733077a9..99954ed60dd 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -209,12 +209,19 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, file_info = db_file cached = True run_static = False + # Regenerate fields stripped by mongo_hooks + if "type" not in file_info: + file_info["type"] = f.get_type() else: # Partial hit file_info = db_file cached = True # We have the base object run_static = True # But we need to re-run static/tools + # Regenerate fields stripped by mongo_hooks + if "type" not in file_info: + file_info["type"] = f.get_type() + if not yara_match: # Update YARA file_info["yara"] = f.get_yara() From b4f67702c2c431ec2e7093e44699493fdd852996 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:43:07 +0100 Subject: [PATCH 09/11] Update file_extra_info.py --- lib/cuckoo/common/integrations/file_extra_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index df1c09dc579..1535190d3f5 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -224,7 +224,7 @@ def static_file_info( log.error("procyon_path specified in processing.conf but the file does not exist") else: data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run() - elif (file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp")) and "rdp" not in data_dictionary: + elif (file_path.endswith(".rdp") or data_dictionary.get("name", "").endswith(".rdp")) and "rdp" not in data_dictionary: data_dictionary["rdp"] = parse_rdp_file(file_path) # It's possible to fool libmagic into thinking our 2007+ file is a zip. # So until we have static analysis for zip files, we can use oleid to fail us out silently, From 0da43269ff7fd6b07440db6aabeaa8c0eb98fa40 Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 14:57:30 +0100 Subject: [PATCH 10/11] Add extra file metadata fields to CAPE/mongo hooks Add additional static fields (yara, cape_yara, yara_hash, options_hash, clamav) to mongo normalization so they aren't stripped. In CAPE processing, ensure the internal pe object is populated (f.get_type(); pefile_object = f.pe) for full and partial hits, and fill missing options_hash and yara_hash for partial results. Also ensure file name and guest_paths are set when absent. These changes restore metadata removed by mongo_hooks and ensure pefile and hash fields are available for downstream results. 
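The backfill is needed because dev_utils/mongo_hooks.py normalizes shared file records down to a whitelist of static fields; conceptually it behaves like the simplified sketch below (not the full hook, which also builds the new dict field by field and handles task references):

    def normalize_file(file_dict, task_id):
        # Only whitelisted static fields survive into the shared "files"
        # collection, so anything outside this tuple has to be regenerated
        # when a cached record is reused.
        static_fields = ("entrypoint", "data", "strings", "type",
                         "yara", "cape_yara", "yara_hash", "options_hash", "clamav")
        return {fld: file_dict[fld] for fld in static_fields if fld in file_dict}
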
--- conf/default/processing.conf.default | 2 ++ dev_utils/mongo_hooks.py | 5 +++++ modules/processing/CAPE.py | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/conf/default/processing.conf.default b/conf/default/processing.conf.default index 8cc2ef115a5..4d28e0fdd81 100644 --- a/conf/default/processing.conf.default +++ b/conf/default/processing.conf.default @@ -230,6 +230,8 @@ userdb_signature = no replace_patterns = no # Use file cache to speed up processing by looking up already processed files in MongoDB file_cache = no +# Store pefile objects for later usage? useful if you doing something in signatures/reporting +pefile_store = no # Deduplicate screenshots - You need to install dependency ImageHash>=4.3.1 [deduplication] diff --git a/dev_utils/mongo_hooks.py b/dev_utils/mongo_hooks.py index 360b15ea451..4f280ab820d 100644 --- a/dev_utils/mongo_hooks.py +++ b/dev_utils/mongo_hooks.py @@ -62,6 +62,11 @@ def normalize_file(file_dict, task_id): "data", "strings", "type", + "yara", + "cape_yara", + "yara_hash", + "options_hash", + "clamav", ) new_dict = {} for fld in static_fields: diff --git a/modules/processing/CAPE.py b/modules/processing/CAPE.py index 99954ed60dd..cbd6e2dd99e 100644 --- a/modules/processing/CAPE.py +++ b/modules/processing/CAPE.py @@ -212,6 +212,11 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, # Regenerate fields stripped by mongo_hooks if "type" not in file_info: file_info["type"] = f.get_type() + + if processing_conf.CAPE.pefile_store: + # Populate internal pe object for self.results["pefiles"] + f.get_type() + pefile_object = f.pe else: # Partial hit file_info = db_file @@ -222,6 +227,17 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, if "type" not in file_info: file_info["type"] = f.get_type() + if processing_conf.CAPE.pefile_store: + # Populate internal pe object for self.results["pefiles"] + f.get_type() + pefile_object = f.pe + + if "options_hash" not in file_info: + file_info["options_hash"] = options_hash + + if "yara_hash" not in file_info: + file_info["yara_hash"] = File.yara_rules_hash + if not yara_match: # Update YARA file_info["yara"] = f.get_yara() @@ -236,6 +252,11 @@ def process_file(self, file_path, append_file, metadata: dict, *, category: str, file_info["yara_hash"] = File.yara_rules_hash run_static = True + if "name" not in file_info: + file_info["name"] = f.get_name() + if "guest_paths" not in file_info: + file_info["guest_paths"] = f.guest_paths + file_info["options_hash"] = options_hash if category in ("static", "file"): From 410fa9a9ead47ab5720d64c768a2ac51f9f4668e Mon Sep 17 00:00:00 2001 From: doomedraven Date: Tue, 10 Feb 2026 15:06:17 +0100 Subject: [PATCH 11/11] set reprocess variable --- lib/cuckoo/core/plugins.py | 1 + modules/reporting/gcs.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/lib/cuckoo/core/plugins.py b/lib/cuckoo/core/plugins.py index 49d6c59eae5..7998a75b027 100644 --- a/lib/cuckoo/core/plugins.py +++ b/lib/cuckoo/core/plugins.py @@ -861,6 +861,7 @@ def process(self, module): current.set_options(options) # Load the content of the analysis.conf file. 
current.cfg = AnalysisConfig(current.conf_path) + current.reprocess = self.reprocess try: log.debug('Executing reporting module "%s"', current.__class__.__name__) diff --git a/modules/reporting/gcs.py b/modules/reporting/gcs.py index 6f88b279cc9..1af389d0a12 100644 --- a/modules/reporting/gcs.py +++ b/modules/reporting/gcs.py @@ -149,6 +149,9 @@ def run(self, results): ) return + if self.reprocess: + return + tlp = results.get("info", {}).get("tlp") analysis_id = results.get("info", {}).get("id")
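

With current.reprocess wired in by this last patch, any reporting module with external side effects can guard itself the same way gcs.py now does; a minimal sketch (the module name and upload method are placeholders, only the reprocess attribute comes from this patch):

    from lib.cuckoo.common.abstracts import Report

    class MyUploader(Report):
        def run(self, results):
            # Set by the reporting runner for every module as of this patch;
            # default to False in case the module is invoked standalone.
            if getattr(self, "reprocess", False):
                return                        # the report was already shipped once
            self.upload(results)              # placeholder for the real side effect

        def upload(self, results):
            raise NotImplementedError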