2 changes: 1 addition & 1 deletion admin/admin_conf.py_example
@@ -23,7 +23,7 @@ JUMP_BOX_SECOND_USERNAME = ""
JUMP_BOX_SECOND_PORT = 22

NUM_THREADS = 5
POSTPROCESS = "systemctl restart cape-processor; systemctl status cape-processor"
POSTPROCESS = "systemctl restart cape-processor; systemctl status --no-pager cape-processor"

EXCLUDE_DIRS = set(
[
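Note: the --no-pager flag matters here because `systemctl status` pipes its output through a pager when it detects a TTY, which can leave a non-interactive post-process run hanging. A minimal sketch of the fixed invocation from Python (the subprocess wrapper and timeout are illustrative, not part of this PR):

import subprocess

# Without --no-pager, `systemctl status` may invoke a pager (less) and block
# waiting for input; with it, output goes straight to stdout and the call returns.
result = subprocess.run(
    ["systemctl", "status", "--no-pager", "cape-processor"],
    capture_output=True,
    text=True,
    timeout=30,  # fail fast instead of hanging the admin run
)
print(result.stdout)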
4 changes: 4 additions & 0 deletions conf/default/processing.conf.default
@@ -228,6 +228,10 @@ max_file_size = 90
userdb_signature = no
# https://capev2.readthedocs.io/en/latest/usage/patterns_replacement.html
replace_patterns = no
# Use file cache to speed up processing by looking up already processed files in MongoDB
file_cache = no
# Store pefile objects for later use. Useful if you are doing something in signatures/reporting
pefile_store = no

# Deduplicate screenshots - You need to install dependency ImageHash>=4.3.1
[deduplication]
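Note: a rough sketch of what the new file_cache option implies — before re-running static extractors, look up an already-processed file in MongoDB by hash. The collection and field names below are illustrative assumptions, not CAPE's actual schema:

import hashlib
from typing import Optional

from pymongo import MongoClient

def lookup_cached_file(file_path: str, db) -> Optional[dict]:
    """Return previously computed static info for this file, if any."""
    with open(file_path, "rb") as f:
        sha256 = hashlib.sha256(f.read()).hexdigest()
    # One record per unique file; reuse it instead of re-extracting.
    return db.files.find_one({"sha256": sha256}, {"_id": 0})

# Usage sketch:
# db = MongoClient("mongodb://127.0.0.1:27017")["cuckoo"]
# cached = lookup_cached_file("/tmp/sample.exe", db)
# if cached:
#     data_dictionary.update(cached)  # the "not in data_dictionary" guards
#                                     # further down then skip the expensive calls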
6 changes: 6 additions & 0 deletions dev_utils/mongo_hooks.py
@@ -61,6 +61,12 @@ def normalize_file(file_dict, task_id):
"entrypoint",
"data",
"strings",
"type",
"yara",
"cape_yara",
"yara_hash",
"options_hash",
"clamav",
)
new_dict = {}
for fld in static_fields:
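Note: for context, normalize_file hoists fields that depend only on the file contents (not on the task) out of each per-task file dict so they can be stored once per unique file; this hunk adds six more such fields. A simplified sketch of that hoisting (the real function also takes task_id, as shown above):

# Fields that are a function of the file bytes alone, so they can live in a
# shared per-sha256 record instead of being duplicated in every task report.
STATIC_FIELDS = (
    "entrypoint", "data", "strings",
    "type", "yara", "cape_yara", "yara_hash", "options_hash", "clamav",
)

def split_static_fields(file_dict: dict) -> dict:
    """Move task-independent fields into a separate dict (sketch)."""
    shared = {}
    for fld in STATIC_FIELDS:
        if fld in file_dict:
            shared[fld] = file_dict.pop(fld)
    return shared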
177 changes: 62 additions & 115 deletions lib/cuckoo/common/integrations/file_extra_info.py
@@ -1,17 +1,15 @@
import concurrent.futures
import functools
import hashlib
import json
import logging
import os
import re
import shlex
import shutil
import signal
import subprocess

# from contextlib import suppress
from typing import Any, DefaultDict, List, Optional, Set, Union
from typing import Any, DefaultDict, List, Optional, Set

import pebble

@@ -38,7 +36,6 @@
from lib.cuckoo.common.load_extra_modules import file_extra_info_load_modules
from lib.cuckoo.common.objects import File
from lib.cuckoo.common.path_utils import (
path_delete,
path_exists,
path_get_size,
path_is_file,
@@ -171,62 +168,68 @@ def static_file_info(

if (
not HAVE_OLETOOLS
and "Zip archive data, at least v2.0" in data_dictionary["type"]
and "Zip archive data, at least v2.0" in data_dictionary.get("type", "")
and package in {"doc", "ppt", "xls", "pub"}
):
log.info("Missing dependencies: pip3 install oletools")

if "MSI Installer" in data_dictionary["type"]:
if "MSI Installer" in data_dictionary.get("type", "") and "msi" not in data_dictionary:
data_dictionary["msi"] = parse_msi(file_path)

# TODO: we need type checking, as it won't work for most static jobs
if HAVE_PEFILE and ("PE32" in data_dictionary["type"] or "MS-DOS executable" in data_dictionary["type"]):
with PortableExecutable(file_path) as pe:
data_dictionary["pe"] = pe.run(task_id)
if HAVE_PEFILE and ("PE32" in data_dictionary.get("type", "") or "MS-DOS executable" in data_dictionary.get("type", "")):
if "pe" not in data_dictionary:
with PortableExecutable(file_path) as pe:
data_dictionary["pe"] = pe.run(task_id)

if HAVE_FLARE_CAPA:
if HAVE_FLARE_CAPA and "flare_capa" not in data_dictionary:
# https://github.com/mandiant/capa/issues/2620
capa_details = flare_capa_details(file_path, "static")
if capa_details:
data_dictionary["flare_capa"] = capa_details

if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary["type"]:
if HAVE_FLOSS and integration_conf.floss.enabled and "Mono" not in data_dictionary.get("type", "") and "floss" not in data_dictionary:
floss_strings = Floss(file_path, "static", "pe").run()
if floss_strings:
data_dictionary["floss"] = floss_strings

if "Mono" in data_dictionary["type"]:
if "Mono" in data_dictionary.get("type", "") and "dotnet" not in data_dictionary:
if integration_conf.general.dotnet:
data_dictionary["dotnet"] = DotNETExecutable(file_path).run()
if processing_conf.strings.dotnet:
if processing_conf.strings.dotnet and "dotnet_strings" not in data_dictionary:
dotnet_strings = dotnet_user_strings(file_path)
if dotnet_strings:
data_dictionary.setdefault("dotnet_strings", dotnet_strings)

elif (HAVE_OLETOOLS and package in {"doc", "ppt", "xls", "pub"} and integration_conf.general.office) or data_dictionary.get("name", "").endswith((".doc", ".ppt", ".xls", ".pub")):
# options is a dict; we need to pass it through get_options
data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run()
elif ("PDF" in data_dictionary["type"] or file_path.endswith(".pdf")) and integration_conf.general.pdf:
data_dictionary["pdf"] = PDF(file_path).run()
if "office" not in data_dictionary:
# options is a dict; we need to pass it through get_options
data_dictionary["office"] = Office(file_path, task_id, data_dictionary["sha256"], options_dict).run()
elif ("PDF" in data_dictionary.get("type", "") or file_path.endswith(".pdf")) and integration_conf.general.pdf:
if "pdf" not in data_dictionary:
data_dictionary["pdf"] = PDF(file_path).run()
elif (
package in {"wsf", "hta"} or data_dictionary["type"] == "XML document text" or file_path.endswith(".wsf")
package in {"wsf", "hta"} or data_dictionary.get("type", "") == "XML document text" or file_path.endswith(".wsf")
) and integration_conf.general.windows_script:
data_dictionary["wsf"] = WindowsScriptFile(file_path).run()
if "wsf" not in data_dictionary:
data_dictionary["wsf"] = WindowsScriptFile(file_path).run()
# elif package in {"js", "vbs"}:
# data_dictionary["js"] = EncodedScriptFile(file_path).run()
elif (package == "lnk" or "MS Windows shortcut" in data_dictionary["type"]) and integration_conf.general.lnk:
data_dictionary["lnk"] = LnkShortcut(file_path).run()
elif ("Java Jar" in data_dictionary["type"] or file_path.endswith(".jar")) and integration_conf.general.java:
if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary):
log.error("procyon_path specified in processing.conf but the file does not exist")
else:
data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run()
elif file_path.endswith(".rdp") or data_dictionary.get("name", {}).endswith(".rdp"):
elif (package == "lnk" or "MS Windows shortcut" in data_dictionary.get("type", "")) and integration_conf.general.lnk:
if "lnk" not in data_dictionary:
data_dictionary["lnk"] = LnkShortcut(file_path).run()
elif ("Java Jar" in data_dictionary.get("type", "") or file_path.endswith(".jar")) and integration_conf.general.java:
if "java" not in data_dictionary:
if integration_conf.procyon.binary and not path_exists(integration_conf.procyon.binary):
log.error("procyon_path specified in processing.conf but the file does not exist")
else:
data_dictionary["java"] = Java(file_path, integration_conf.procyon.binary).run()
elif (file_path.endswith(".rdp") or data_dictionary.get("name", "").endswith(".rdp")) and "rdp" not in data_dictionary:
data_dictionary["rdp"] = parse_rdp_file(file_path)
# It's possible to fool libmagic into thinking our 2007+ file is a zip.
# So until we have static analysis for zip files, we can use oleid to fail us out silently,
# yielding no static analysis results for actual zip files.
# elif ("ELF" in data_dictionary["type"] or file_path.endswith(".elf")) and integration_conf.general.elf:
# elif ("ELF" in data_dictionary.get("type", "") or file_path.endswith(".elf")) and integration_conf.general.elf:
# data_dictionary["elf"] = ELF(file_path).run()
# data_dictionary["keys"] = f.get_keys()
# elif HAVE_OLETOOLS and package == "hwp" and integration_conf.general.hwp:
@@ -237,13 +240,13 @@ def static_file_info(
if not file_path.startswith(exclude_startswith) and not file_path.endswith(excluded_extensions):
data_dictionary["data"] = is_text_file(data_dictionary, file_path, processing_conf.CAPE.buffer, data)

if processing_conf.trid.enabled:
if processing_conf.trid.enabled and "trid" not in data_dictionary:
data_dictionary["trid"] = trid_info(file_path)

if processing_conf.die.enabled:
if processing_conf.die.enabled and "die" not in data_dictionary:
data_dictionary["die"] = detect_it_easy_info(file_path)

if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary["type"]:
if HAVE_FLOSS and processing_conf.floss.enabled and "Mono" not in data_dictionary.get("type", "") and "floss" not in data_dictionary:
floss_strings = Floss(file_path, package).run()
if floss_strings:
data_dictionary["floss"] = floss_strings
@@ -253,7 +256,7 @@ def static_file_info(
# think that we want to look them up on-demand (i.e. display the
# "strings" button linking to an on_demand URL).
data_dictionary["strings"] = []
elif HAVE_STRINGS:
elif HAVE_STRINGS and "strings" not in data_dictionary:
strings = extract_strings(file_path, dedup=True)
data_dictionary["strings"] = strings
else:
@@ -262,7 +265,7 @@
pass

# ToDo we need url support
if HAVE_VIRUSTOTAL and processing_conf.virustotal.enabled:
if HAVE_VIRUSTOTAL and processing_conf.virustotal.enabled and "virustotal" not in data_dictionary:
vt_details = vt_lookup("file", file_path, results)
if vt_details:
data_dictionary["virustotal"] = vt_details
@@ -401,25 +404,7 @@ def _extracted_files_metadata(
return metadata


def pass_signal(proc, signum, frame):
proc.send_signal(signum)


def run_tool(*args, **kwargs) -> Union[bytes, str]:
"""Start a subprocess to run the given tool. Make sure to pass a SIGTERM signal to
that process if it is received.
"""
kwargs["stdout"] = subprocess.PIPE
old_handler = None
try:
proc = subprocess.Popen(*args, **kwargs)
old_handler = signal.signal(signal.SIGTERM, functools.partial(pass_signal, proc))
(stdout, stderr) = proc.communicate()
return stdout
finally:
if old_handler:
signal.signal(signal.SIGTERM, old_handler)

from lib.cuckoo.common.integrations.utils import run_tool

def generic_file_extractors(
file: str,
@@ -448,37 +433,24 @@ def generic_file_extractors(
# Arguments that some extractors need. They will always get passed, so the
# extractor functions need to accept `**_` and just discard them.
kwargs = {
"filetype": data_dictionary["type"],
"filetype": data_dictionary.get("type", ""),
"data_dictionary": data_dictionary,
"options": options,
"tests": tests,
}

file_info_funcs = [
msi_extract,
kixtart_extract,
vbe_extract,
batch_extract,
UnAutoIt_extract,
UPX_unpack,
RarSFX_extract,
Inno_extract,
SevenZip_unpack,
de4dot_deobfuscate,
eziriz_deobfuscate,
office_one,
msix_extract,
UnGPG_extract,
]

futures = {}
executed_tools = data_dictionary.setdefault("executed_tools", [])
with pebble.ProcessPool(max_workers=int(integration_conf.general.max_workers)) as pool:
# Prefer custom modules over the built-in ones, since only 1 is allowed
# to be the extracted_files_tool.
if extra_info_modules:
for module in extra_info_modules:
func_timeout = int(getattr(module, "timeout", 60))
funcname = module.__name__.split(".")[-1]
if funcname in executed_tools:
continue
executed_tools.append(funcname)
futures[funcname] = pool.schedule(module.extract_details, args=args, kwargs=kwargs, timeout=func_timeout)

for extraction_func in file_info_funcs:
@@ -489,6 +461,10 @@
):
continue

if funcname in executed_tools:
continue
executed_tools.append(funcname)

func_timeout = int(getattr(integration_conf, funcname, {}).get("timeout", 60))
futures[funcname] = pool.schedule(extraction_func, args=args, kwargs=kwargs, timeout=func_timeout)
pool.join()
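Note: executed_tools plays the same role for the pebble-scheduled extractors — the list lives inside data_dictionary, so it persists with the cached file record, and an extractor that already ran for this file is not scheduled again. That schedule-once logic in isolation (a sketch; the helper name is illustrative, while pool.schedule(func, args=..., kwargs=..., timeout=...) is pebble's real API):

import pebble

def schedule_once(pool: pebble.ProcessPool, data_dictionary: dict,
                  func, args=(), kwargs=None, timeout: int = 60):
    """Schedule `func` at most once per file across processing runs."""
    executed = data_dictionary.setdefault("executed_tools", [])
    funcname = func.__name__
    if funcname in executed:
        return None  # already ran, possibly in an earlier processing pass
    executed.append(funcname)
    return pool.schedule(func, args=args, kwargs=kwargs or {}, timeout=timeout)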
@@ -677,51 +653,6 @@ def de4dot_deobfuscate(file: str, *, filetype: str, **_) -> ExtractorReturnType:
return ctx


@time_tracker
def msi_extract(file: str, *, filetype: str, **kwargs) -> ExtractorReturnType:
"""Work on MSI Installers"""

if "MSI Installer" not in filetype:
return

# ToDo replace MsiExtract with pymsi
extracted_files = []
# sudo apt install msitools
with extractor_ctx(file, "MsiExtract", prefix="msidump_", folder=tools_folder) as ctx:
tempdir = ctx["tempdir"]
output = False
if not kwargs.get("tests"):
# msiextract extracts in a different way than 7z; we need to add subfolder support
output = run_tool(
[integration_conf.msi_extract.binary, file, "--directory", tempdir],
universal_newlines=True,
stderr=subprocess.PIPE,
)
if output:
extracted_files = [
extracted_file
for extracted_file in list(filter(None, output.split("\n")))
if path_is_file(os.path.join(tempdir, extracted_file))
]
else:
output = run_tool(
[sevenzip_binary, "e", f"-o{tempdir}", "-y", file],
universal_newlines=True,
stderr=subprocess.PIPE,
)
valid_msi_filetypes = ["PE32", "text", "Microsoft Cabinet archive"]
for root, _, filenames in os.walk(tempdir):
for filename in filenames:
path = os.path.join(root, filename)
if any([x in File(path).get_type() for x in valid_msi_filetypes]):
os.rename(path, os.path.join(root, filename.split(".")[-1].strip("'").strip("!")))
else:
path_delete(path)
extracted_files = collect_extracted_filenames(tempdir)

ctx["extracted_files"] = extracted_files

return ctx


@time_tracker
@@ -1021,3 +952,19 @@ def UnGPG_extract(file: str, filetype: str, data_dictionary: dict, options: dict
ctx["extracted_files"] = collect_extracted_filenames(tempdir)

return ctx

file_info_funcs = [
kixtart_extract,
vbe_extract,
batch_extract,
UnAutoIt_extract,
UPX_unpack,
RarSFX_extract,
Inno_extract,
SevenZip_unpack,
de4dot_deobfuscate,
eziriz_deobfuscate,
office_one,
msix_extract,
UnGPG_extract,
]