From 8c65320695784e1678a85e442cd7f1ca66ef0215 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Tue, 23 Dec 2025 09:38:23 -0600 Subject: [PATCH 1/2] Add hypothesis harness, tarfile strategies --- .gitignore | 216 ++++++++++++++++++++++++++++++++++++++++++ Makefile | 4 +- tarfile_hypothesis.py | 95 +++++++++++++++++++ 3 files changed, 314 insertions(+), 1 deletion(-) create mode 100644 .gitignore create mode 100644 tarfile_hypothesis.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..42f20c5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,216 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/Makefile b/Makefile index a550011..f81c708 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -all : fuzzer-html fuzzer-email fuzzer-httpclient fuzzer-json fuzzer-difflib fuzzer-csv fuzzer-decode fuzzer-ast fuzzer-tarfile fuzzer-zipfile fuzzer-re fuzzer-configparser fuzzer-tomllib fuzzer-plistlib fuzzer-xml +all : fuzzer-html fuzzer-email fuzzer-httpclient fuzzer-json fuzzer-difflib fuzzer-csv fuzzer-decode fuzzer-ast fuzzer-tarfile fuzzer-tarfile-hypothesis fuzzer-zipfile fuzzer-re fuzzer-configparser fuzzer-tomllib fuzzer-plistlib fuzzer-xml PYTHON_CONFIG_PATH=$(CPYTHON_INSTALL_PATH)/bin/python3-config CXXFLAGS += $(shell $(PYTHON_CONFIG_PATH) --cflags) @@ -26,6 +26,8 @@ fuzzer-zipfile: clang++ $(CXXFLAGS) $(LIB_FUZZING_ENGINE) -std=c++17 fuzzer.cpp -DPYTHON_HARNESS_PATH="\"zipfile.py\"" -ldl $(LDFLAGS) -o fuzzer-zipfile fuzzer-tarfile: clang++ $(CXXFLAGS) $(LIB_FUZZING_ENGINE) -std=c++17 fuzzer.cpp -DPYTHON_HARNESS_PATH="\"tarfile.py\"" -ldl $(LDFLAGS) -o fuzzer-tarfile +fuzzer-tarfile-hypothesis: + clang++ $(CXXFLAGS) $(LIB_FUZZING_ENGINE) -std=c++17 fuzzer.cpp -DPYTHON_HARNESS_PATH="\"tarfile_hypothesis.py\"" -ldl $(LDFLAGS) -o fuzzer-tarfile-hypothesis fuzzer-configparser: clang++ $(CXXFLAGS) $(LIB_FUZZING_ENGINE) -std=c++17 fuzzer.cpp -DPYTHON_HARNESS_PATH="\"configparser.py\"" -ldl $(LDFLAGS) -o fuzzer-configparser fuzzer-tomllib: diff --git a/tarfile_hypothesis.py b/tarfile_hypothesis.py new file mode 100644 index 0000000..5602365 --- /dev/null +++ b/tarfile_hypothesis.py @@ -0,0 +1,95 @@ +import io +import tarfile + +from hypothesis import given +from hypothesis import strategies as st + + +def utf8_text(*, max_size: int, min_size: int = 0) -> st.SearchStrategy[str]: + """Returns UTF-8 text that, when encoded to bytes, + is within the size restrictions. + """ + return st.text(min_size=min_size, max_size=max_size).filter( + lambda s: min_size <= len(s.encode("utf-8")) <= max_size + ) + + +def tar_integers( + *, format: int, digits: int = 1, allow_negative: bool = False +) -> st.SearchStrategy[tuple[io.BytesIO, tarfile.TarFile]]: + """tar has a unique way of encoding integers that is format-dependent + and based on the number of "digits" allowed for a value. + """ + if digits <= 0: + raise ValueError("Digits must be greater than one.") + if format == tarfile.GNU_FORMAT: + min_value = -(256 ** (digits - 1)) if allow_negative else 0 + max_value = (256 ** (digits - 1)) - 1 + else: + min_value = 0 + max_value = (4**digits) - 1 + return st.integers(min_value=min_value, max_value=max_value) + + +@st.composite +def tar_archives(draw): + buf = io.BytesIO() + format = draw( + st.sampled_from((tarfile.GNU_FORMAT, tarfile.PAX_FORMAT, tarfile.USTAR_FORMAT)) + ) + tar = tarfile.TarFile(fileobj=buf, format=format, mode="w") + types = list(tarfile.REGULAR_TYPES) + + for _ in range(draw(st.integers(min_value=1, max_value=10))): + info = tarfile.TarInfo( + name=draw(utf8_text(min_size=1, max_size=tarfile.LENGTH_NAME)) + ) + if draw(st.booleans()): + fileobj = io.BytesIO(draw(st.binary(min_size=0, max_size=0xFFFFFFFF))) + else: + fileobj = None + + info.type = draw(st.sampled_from(types)) + info.mode = draw(tar_integers(format=format, digits=8)) + info.uid = draw(tar_integers(format=format, digits=8)) + info.gid = draw(tar_integers(format=format, digits=8)) + info.mtime = draw(tar_integers(format=format, digits=12)) + info.devmajor = draw(tar_integers(format=format, digits=8)) + info.devminor = draw(tar_integers(format=format, digits=8)) + + if draw(st.booleans()): + info.linkname = draw(utf8_text(min_size=1, max_size=tarfile.LENGTH_LINK)) + + def maybe_set_pax_header(obj, name, value): + if draw(st.booleans()): + obj.pax_headers[name] = value + + if format == tarfile.PAX_FORMAT: + maybe_set_pax_header(info, "uname", draw(st.text(max_size=32))) + maybe_set_pax_header(info, "gname", draw(st.text(max_size=32))) + maybe_set_pax_header( + info, + "path", + draw(utf8_text(min_size=1, max_size=tarfile.LENGTH_NAME)), + ) + maybe_set_pax_header( + info, + "linkpath", + draw(utf8_text(min_size=1, max_size=tarfile.LENGTH_LINK)), + ) + + tar.addfile(info, fileobj=fileobj) + + return buf, tar + + +@given(tar_archives()) +def tar_archive_fuzz_target(buf_tar: tuple[io.BytesIO, tarfile.TarFile]) -> None: + buf, tar1 = buf_tar + tar2 = tarfile.TarFile(fileobj=buf) + # Assert that tar files round-trip. + assert list(tar1.getmembers()) == list(tar2.getmembers()) + + +# Exposes the Hypothesis fuzz target for integrating with OSS-Fuzz. +FuzzerRunOne = tar_archive_fuzz_target.hypothesis.fuzz_one_input From fdcc82578a0242afdaf35a773217b7e785ddd1b9 Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Wed, 14 Jan 2026 15:40:04 -0600 Subject: [PATCH 2/2] Always print 'reproduce_failure()' with blob --- tarfile_hypothesis.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tarfile_hypothesis.py b/tarfile_hypothesis.py index 5602365..ce7d082 100644 --- a/tarfile_hypothesis.py +++ b/tarfile_hypothesis.py @@ -1,7 +1,7 @@ import io import tarfile -from hypothesis import given +from hypothesis import given, settings from hypothesis import strategies as st @@ -32,6 +32,7 @@ def tar_integers( @st.composite +@settings(print_blob=True) def tar_archives(draw): buf = io.BytesIO() format = draw( @@ -45,7 +46,7 @@ def tar_archives(draw): name=draw(utf8_text(min_size=1, max_size=tarfile.LENGTH_NAME)) ) if draw(st.booleans()): - fileobj = io.BytesIO(draw(st.binary(min_size=0, max_size=0xFFFFFFFF))) + fileobj = io.BytesIO(draw(st.binary(min_size=0, max_size=0xFFFF))) else: fileobj = None @@ -84,11 +85,14 @@ def maybe_set_pax_header(obj, name, value): @given(tar_archives()) +@settings(print_blob=True) def tar_archive_fuzz_target(buf_tar: tuple[io.BytesIO, tarfile.TarFile]) -> None: buf, tar1 = buf_tar tar2 = tarfile.TarFile(fileobj=buf) # Assert that tar files round-trip. - assert list(tar1.getmembers()) == list(tar2.getmembers()) + assert list(tar1.getmembers()) == list(tar2.getmembers()), ( + repr(buf.getvalue()) + ) # Exposes the Hypothesis fuzz target for integrating with OSS-Fuzz.