From 59bd7235ee85fe009c2571d69b24f1acfdfd0d65 Mon Sep 17 00:00:00 2001
From: Eloy Felix
Date: Thu, 15 Jan 2026 22:17:57 +0100
Subject: [PATCH 1/5] parquet backend

---
 FPSim2/FPSim2.py               | 4 ++++
 FPSim2/base.py                 | 7 +++++++
 FPSim2/io/backends/__init__.py | 3 ++-
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/FPSim2/FPSim2.py b/FPSim2/FPSim2.py
index 7b07ee84..566dee2e 100644
--- a/FPSim2/FPSim2.py
+++ b/FPSim2/FPSim2.py
@@ -53,6 +53,10 @@ def __init__(
             raise ValueError(
                 "Loading the fingerprints into memory is required for the SQLAlchemy backend"
             )
+        if not in_memory_fps and storage_backend == "parquet":
+            raise ValueError(
+                "Loading the fingerprints into memory is required for the Parquet backend"
+            )
         self.empty_sim = np.ndarray((0,), dtype=[("mol_id", "<u4"), ("coeff", "<f4")])
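For readers skimming the series, a minimal usage sketch of the guard added above. The engine call matches the API exercised by the test patch later in the series; the "fps.parquet" path is a placeholder:

    from FPSim2 import FPSim2Engine

    # In-memory fingerprints (the default) work with the Parquet backend
    fpe = FPSim2Engine("fps.parquet", storage_backend="parquet")

    # On-disk search remains a PyTables-only feature, so this raises the
    # ValueError added in this patch
    fpe_disk = FPSim2Engine(
        "fps.parquet", in_memory_fps=False, storage_backend="parquet"
    )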
From: Eloy Felix
Date: Thu, 15 Jan 2026 23:11:24 +0100
Subject: [PATCH 2/5] parquet backend

---
 FPSim2/io/backends/parquet.py | 259 ++++++++++++++++++++++++++++++++++
 1 file changed, 259 insertions(+)
 create mode 100644 FPSim2/io/backends/parquet.py

diff --git a/FPSim2/io/backends/parquet.py b/FPSim2/io/backends/parquet.py
new file mode 100644
index 00000000..e79cf3f3
--- /dev/null
+++ b/FPSim2/io/backends/parquet.py
@@ -0,0 +1,259 @@
+"""Parquet storage backend for FPSim2."""
+
+from typing import Dict, Iterable as IterableType, Tuple, Union
+from .base import BaseStorageBackend
+import numpy as np
+import rdkit
+import math
+import json
+from importlib.metadata import version
+
+__version__ = version("FPSim2")
+
+try:
+    import pyarrow.parquet as pq
+    import pyarrow as pa
+
+    HAS_PYARROW = True
+except ImportError:
+    HAS_PYARROW = False
+
+
+def create_parquet_file(
+    mols_source: Union[str, IterableType],
+    filename: str,
+    mol_format: str,
+    fp_type: str,
+    fp_params: dict = {},
+    mol_id_prop: str = "mol_id",
+    full_sanitization: bool = True,
+    compression: str = "zstd",
+    compression_level: int = 3,
+    row_group_size: int = 100000,
+) -> None:
+    """Create a Parquet fingerprint database file."""
+    if not HAS_PYARROW:
+        raise ImportError("PyArrow is required. Install with: pip install pyarrow")
+
+    from ..chem import (
+        build_fp,
+        get_mol_supplier,
+        get_fp_length,
+        FP_FUNC_DEFAULTS,
+        RDKIT_PARSE_FUNCS,
+    )
+
+    is_valid_file = isinstance(mols_source, str) and (
+        mols_source.endswith((".smi", ".sdf", ".sdf.gz"))
+    )
+    if not (is_valid_file or mol_format in RDKIT_PARSE_FUNCS):
+        raise ValueError(f"Unsupported mol_format: {mol_format}")
+
+    if fp_type not in FP_FUNC_DEFAULTS:
+        raise ValueError(f"Unsupported fp_type: {fp_type}")
+
+    if not fp_params:
+        fp_params = FP_FUNC_DEFAULTS[fp_type]
+    else:
+        if "fpSize" not in fp_params:
+            if "fpSize" in FP_FUNC_DEFAULTS[fp_type]:
+                fp_params["fpSize"] = FP_FUNC_DEFAULTS[fp_type]["fpSize"]
+
+    supplier = get_mol_supplier(mols_source)
+    fp_length = get_fp_length(fp_type, fp_params)
+    n_fp_cols = math.ceil(fp_length / 64)
+
+    # Build schema with metadata
+    fields = [("mol_id", pa.int64())]
+    fields += [(f"f{i + 1}", pa.uint64()) for i in range(n_fp_cols)]
+    fields += [("popcnt", pa.int64())]
+    schema = pa.schema(fields)
+
+    metadata = {
+        b"fp_type": fp_type.encode(),
+        b"fp_params": json.dumps(fp_params).encode(),
+        b"rdkit_version": rdkit.__version__.encode(),
+        b"fpsim2_version": __version__.encode(),
+    }
+    schema = schema.with_metadata(metadata)
+
+    iterable = supplier(
+        mols_source,
+        full_sanitization,
+        mol_format=mol_format,
+        mol_id_prop=mol_id_prop,
+    )
+
+    # Write in batches to avoid loading everything into memory
+    fps_batch = []
+    rows_written = 0
+
+    with pq.ParquetWriter(
+        filename,
+        schema,
+        compression=compression,
+        compression_level=compression_level if compression else None,
+    ) as writer:
+        for mol_id, rdmol in iterable:
+            fp = build_fp(rdmol, fp_type, fp_params, mol_id)
+            fps_batch.append(fp)
+
+            if len(fps_batch) >= row_group_size:
+                _write_batch(writer, fps_batch, n_fp_cols, schema)
+                rows_written += len(fps_batch)
+                fps_batch = []
+
+        # Write the remaining batch
+        if fps_batch:
+            _write_batch(writer, fps_batch, n_fp_cols, schema)
+            rows_written += len(fps_batch)
+
+    if rows_written == 0:
+        raise ValueError("No valid molecules found in source")
+
+
+def _write_batch(writer, fps_batch, n_fp_cols, schema):
+    """Write a batch of fingerprints to Parquet."""
+    fps_array = np.array(fps_batch, dtype=np.uint64)
+    arrays = {"mol_id": pa.array(fps_array[:, 0].astype(np.int64))}
+    for i in range(n_fp_cols):
+        arrays[f"f{i + 1}"] = pa.array(fps_array[:, i + 1])
+    arrays["popcnt"] = pa.array(fps_array[:, -1].astype(np.int64))
+    table = pa.table(arrays, schema=schema)
+    writer.write_table(table)
+
+
+def h5_to_parquet(
+    h5_filename: str,
+    parquet_filename: str,
+    compression: str = "zstd",
+    compression_level: int = 3,
+    row_group_size: int = 100000,
+) -> None:
+    """Convert an existing HDF5/PyTables FPSim2 database to Parquet format."""
+    if not HAS_PYARROW:
+        raise ImportError("PyArrow is required. Install with: pip install pyarrow")
+
+    import tables as tb
+    from .pytables import get_fp_length
+
+    with tb.open_file(h5_filename, mode="r") as fp_file:
+        fp_type = fp_file.root.config[0]
+        fp_params = fp_file.root.config[1]
+        rdkit_ver = fp_file.root.config[2]
+        fpsim2_ver = fp_file.root.config[3]
+
+        fp_length = get_fp_length(fp_type, fp_params)
+        n_fp_cols = math.ceil(fp_length / 64)
+        n_rows = fp_file.root.fps.nrows
+        num_fields = len(fp_file.root.fps.dtype)
+
+        # Build schema with metadata
+        fields = [("mol_id", pa.int64())]
+        fields += [(f"f{i + 1}", pa.uint64()) for i in range(n_fp_cols)]
+        fields += [("popcnt", pa.int64())]
+        schema = pa.schema(fields)
+
+        metadata = {
+            b"fp_type": fp_type.encode(),
+            b"fp_params": json.dumps(fp_params).encode(),
+            b"rdkit_version": rdkit_ver.encode(),
+            b"fpsim2_version": fpsim2_ver.encode(),
+        }
+        schema = schema.with_metadata(metadata)
+
+        # Write in chunks to avoid loading all into memory
+        with pq.ParquetWriter(
+            parquet_filename,
+            schema,
+            compression=compression,
+            compression_level=compression_level if compression else None,
+        ) as writer:
+            for start in range(0, n_rows, row_group_size):
+                end = min(start + row_group_size, n_rows)
+                chunk = fp_file.root.fps[start:end]
+                fps_array = chunk.view("<u8").reshape(-1, num_fields)


+class ParquetStorageBackend(BaseStorageBackend):
+    def __init__(self, fp_filename: str) -> None:
+        if not HAS_PYARROW:
+            raise ImportError("PyArrow is required. Install with: pip install pyarrow")
+
+        super(ParquetStorageBackend, self).__init__()
+        self.name = "parquet"
+        self.fp_filename = fp_filename
+        self.fp_type, self.fp_params, self.rdkit_ver, self.fpsim2_ver = (
+            self.read_parameters()
+        )
+        self.load_fps()
+        self.load_popcnt_bins()
+
+        if self.rdkit_ver and self.rdkit_ver != rdkit.__version__:
+            print(
+                f"Warning: Database was created with RDKit version {self.rdkit_ver} "
+                f"but the installed version is {rdkit.__version__}. "
+                "Please ensure there were no relevant changes in RDKit regarding "
+                "fingerprint generation between these versions."
+            )
+
+    def read_parameters(self) -> Tuple[str, Dict, str, str]:
+        """Read fingerprint parameters from file metadata."""
+        pf = pq.ParquetFile(self.fp_filename)
+        metadata = pf.schema_arrow.metadata or {}
+
+        def get_meta(key: str) -> str:
+            val = metadata.get(key.encode(), b"")
+            return val.decode() if val else None
+
+        fp_type = get_meta("fp_type")
+        fp_params_str = get_meta("fp_params")
+        fp_params = json.loads(fp_params_str) if fp_params_str else {}
+        rdkit_ver = get_meta("rdkit_version")
+        fpsim2_ver = get_meta("fpsim2_version")
+
+        return fp_type, fp_params, rdkit_ver, fpsim2_ver
+
+    def load_fps(self) -> None:
+        """Load fingerprints from Parquet file into memory."""
+        pf = pq.ParquetFile(self.fp_filename)
+        schema = pf.schema_arrow
+        fp_cols = sorted(
+            [f.name for f in schema if f.name.startswith("f") and f.name[1:].isdigit()],
+            key=lambda x: int(x[1:]),
+        )
+        columns = ["mol_id"] + fp_cols + ["popcnt"]
+
+        # Build structured dtype for in-place sort
+        dtype = [("fp_id", "<u8")]

+    def load_popcnt_bins(self) -> None:
+        self.popcnt_bins = self.calc_popcnt_bins(self.fps)
\ No newline at end of file
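As a quick sanity check of the new module, a sketch of how the creation path and the embedded metadata can be exercised. The "chembl.smi"/"chembl.parquet" paths are placeholders; everything else follows the signatures added above, and ParquetFile.schema_arrow.metadata is standard PyArrow:

    from FPSim2.io.backends.parquet import create_parquet_file
    import pyarrow.parquet as pq

    # Build a 2048-bit Morgan fingerprint database from a SMILES file
    create_parquet_file(
        mols_source="chembl.smi",
        filename="chembl.parquet",
        mol_format="smi",
        fp_type="Morgan",
        fp_params={"radius": 2, "fpSize": 2048},
    )

    # The fingerprint parameters travel with the file as schema metadata,
    # so the database is self-describing
    meta = pq.ParquetFile("chembl.parquet").schema_arrow.metadata
    print(meta[b"fp_type"])    # b'Morgan'
    print(meta[b"fp_params"])  # b'{"radius": 2, "fpSize": 2048}'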
From 4b3d66d01c1190a36cda2b8db97e46dc18f0c0ec Mon Sep 17 00:00:00 2001
From: Eloy Felix
Date: Thu, 15 Jan 2026 23:19:00 +0100
Subject: [PATCH 3/5] parquet backend

---
 FPSim2/io/backends/parquet.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/FPSim2/io/backends/parquet.py b/FPSim2/io/backends/parquet.py
index e79cf3f3..fb119e62 100644
--- a/FPSim2/io/backends/parquet.py
+++ b/FPSim2/io/backends/parquet.py
@@ -239,15 +239,14 @@ def load_fps(self) -> None:
         dtype += [(f"f{i+1}", "<u8") for i in range(len(fp_cols))]
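The hunk body above is truncated, but the comment it touches ("Build structured dtype for in-place sort") names a concrete NumPy trick worth spelling out. A standalone sketch of that trick, not the patch's literal code (field names and toy values are illustrative):

    import numpy as np

    # A toy (n_mols, 4) fingerprint matrix: mol_id, two 64-bit fp words, popcnt
    fps = np.array(
        [[1, 0b1011, 0b0001, 4],
         [2, 0b0001, 0b0000, 1],
         [3, 0b1111, 0b1111, 8]],
        dtype=np.uint64,
    )

    # Reinterpreting each row as one structured record is zero-copy because
    # all fields share the 8-byte itemsize
    dtype = [("fp_id", "<u8"), ("f1", "<u8"), ("f2", "<u8"), ("popcnt", "<u8")]
    rows = fps.view(dtype).reshape(-1)

    # Sorting by one field reorders whole rows of the original array in place
    rows.sort(order="popcnt")
    print(fps)  # rows now ordered 1, 4, 8 by popcount

Because the structured view shares memory with the 2D array, sorting by the popcnt field avoids materialising the fancy-indexing copy that argsort-based reordering would need, which matters for multi-million-row fingerprint sets.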
From: Eloy Felix
Date: Thu, 15 Jan 2026 23:49:22 +0100
Subject: [PATCH 4/5] add tests

---
 tests/test_parquet.py | 145 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 tests/test_parquet.py

diff --git a/tests/test_parquet.py b/tests/test_parquet.py
new file mode 100644
index 00000000..7d17c554
--- /dev/null
+++ b/tests/test_parquet.py
@@ -0,0 +1,145 @@
+"""Tests for the Parquet storage backend."""
+
+import pytest
+import numpy as np
+import os
+import tempfile
+
+# Skip all tests if pyarrow is not installed
+pytest.importorskip("pyarrow")
+
+from FPSim2 import FPSim2Engine
+from FPSim2.io.backends.parquet import (
+    ParquetStorageBackend,
+    create_parquet_file,
+    h5_to_parquet,
+)
+
+
+@pytest.fixture
+def smi_file():
+    return "tests/data/10mols.smi"
+
+
+@pytest.fixture
+def h5_file():
+    return "tests/data/10mols.h5"
+
+
+@pytest.fixture
+def parquet_file(smi_file):
+    """Create a temporary Parquet file from test molecules."""
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f:
+        parquet_path = f.name
+
+    create_parquet_file(
+        mols_source=smi_file,
+        filename=parquet_path,
+        mol_format="smi",
+        fp_type="Morgan",
+        fp_params={"radius": 2, "fpSize": 2048},
+    )
+
+    yield parquet_path
+
+    # Cleanup
+    if os.path.exists(parquet_path):
+        os.remove(parquet_path)
+
+
+@pytest.fixture
+def parquet_from_h5(h5_file):
+    """Create a temporary Parquet file converted from HDF5."""
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as f:
+        parquet_path = f.name
+
+    h5_to_parquet(h5_file, parquet_path)
+
+    yield parquet_path
+
+    if os.path.exists(parquet_path):
+        os.remove(parquet_path)
+
+
+class TestParquetBackend:
+    """Test ParquetStorageBackend functionality."""
+
+    def test_create_parquet_file(self, parquet_file):
+        """Test that Parquet file is created correctly."""
+        import pyarrow.parquet as pq
+
+        pf = pq.ParquetFile(parquet_file)
+        assert pf.metadata.num_rows == 10
+
+        # Check schema has expected columns
+        schema = pf.schema_arrow
+        assert "mol_id" in schema.names
+        assert "popcnt" in schema.names
+        assert "f1" in schema.names  # At least one fingerprint column
+
+    def test_load_parquet_backend(self, parquet_file):
+        """Test loading Parquet file into ParquetStorageBackend."""
+        backend = ParquetStorageBackend(parquet_file)
+
+        assert backend.fps is not None
+        assert backend.fps.shape[0] == 10
+        assert backend.popcnt_bins is not None
+
+    def test_fpsim2_engine_parquet(self, parquet_file):
+        """Test FPSim2Engine with Parquet backend."""
+        fpe = FPSim2Engine(parquet_file, storage_backend="parquet")
+
+        assert fpe.fps.shape[0] == 10
+        assert fpe.fp_type == "Morgan"
+        assert fpe.fp_params["radius"] == 2
+
+    def test_similarity_search_parquet(self, parquet_file):
+        """Test similarity search with Parquet backend."""
+        fpe = FPSim2Engine(parquet_file, storage_backend="parquet")
+
+        # Use first molecule as query
+        results = fpe.similarity("Cc1ccc(-n2ncc(=O)[nH]c2=O)cc1", 0.5)
+
+        assert len(results) > 0
+        assert all(r[1] >= 0.5 for r in results)  # All results above threshold
+
+    def test_h5_to_parquet_conversion(self, h5_file, parquet_from_h5):
+        """Test HDF5 to Parquet conversion produces equivalent results."""
+        # Load both backends
+        fpe_h5 = FPSim2Engine(h5_file, storage_backend="pytables")
+        fpe_pq = FPSim2Engine(parquet_from_h5, storage_backend="parquet")
+
+        # Compare fingerprint arrays
+        np.testing.assert_array_equal(fpe_h5.fps, fpe_pq.fps)
+
+        # Compare metadata
+        assert fpe_h5.fp_type == fpe_pq.fp_type
+        assert fpe_h5.fp_params == fpe_pq.fp_params
+
+    def test_search_results_match_h5(self, h5_file, parquet_from_h5):
+        """Test that Parquet backend produces same search results as HDF5."""
+        fpe_h5 = FPSim2Engine(h5_file, storage_backend="pytables")
+        fpe_pq = FPSim2Engine(parquet_from_h5, storage_backend="parquet")
+
+        query = "Cc1ccc(-n2ncc(=O)[nH]c2=O)cc1"
+        threshold = 0.5
+
+        results_h5 = fpe_h5.similarity(query, threshold)
+        results_pq = fpe_pq.similarity(query, threshold)
+
+        # Same number of results
+        assert len(results_h5) == len(results_pq)
+
+        # Same mol_ids (may be in a different order due to floating point)
+        h5_ids = set(r[0] for r in results_h5)
+        pq_ids = set(r[0] for r in results_pq)
+        assert h5_ids == pq_ids
+
+    def test_metadata_from_parquet(self, parquet_file):
+        """Test that metadata is correctly read from Parquet file."""
+        backend = ParquetStorageBackend(parquet_file)
+
+        assert backend.fp_type == "Morgan"
+        assert backend.fp_params["radius"] == 2
+        assert backend.fp_params["fpSize"] == 2048
+        assert backend.rdkit_ver is not None
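The conversion fixture above doubles as documentation for migrating existing databases. Outside the test suite the same two steps suffice; the paths and query SMILES here are placeholders:

    from FPSim2 import FPSim2Engine
    from FPSim2.io.backends.parquet import h5_to_parquet

    # One-off migration of an existing PyTables database
    h5_to_parquet("chembl.h5", "chembl.parquet", compression="zstd")

    # Then search it exactly as with the other in-memory backends
    fpe = FPSim2Engine("chembl.parquet", storage_backend="parquet")
    results = fpe.similarity("CC(=O)Oc1ccccc1C(=O)O", 0.7)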
From b0597349d7957067f412d4a559b02160ad223045 Mon Sep 17 00:00:00 2001
From: Eloy Felix
Date: Tue, 3 Feb 2026 17:21:37 +0100
Subject: [PATCH 5/5] time parquet backend

---
 FPSim2/io/backends/parquet.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/FPSim2/io/backends/parquet.py b/FPSim2/io/backends/parquet.py
index fb119e62..2c5f8be9 100644
--- a/FPSim2/io/backends/parquet.py
+++ b/FPSim2/io/backends/parquet.py
@@ -6,6 +6,7 @@
 import rdkit
 import math
 import json
+import time
 from importlib.metadata import version
 
 __version__ = version("FPSim2")
@@ -226,6 +227,8 @@ def get_meta(key: str) -> str:
 
     def load_fps(self) -> None:
         """Load fingerprints from Parquet file into memory."""
+        # Time loading the file
+        load_start = time.time()
         pf = pq.ParquetFile(self.fp_filename)
         schema = pf.schema_arrow
         fp_cols = sorted(
@@ -240,16 +243,26 @@ def load_fps(self) -> None:
         dtype += [("popcnt", "<u8")]
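The tail of this final hunk is cut off, so what ultimately happens to load_start is not visible; presumably the elapsed time is reported once the structured array is built and sorted. The usual shape of such instrumentation, as a self-contained sketch rather than the patch's literal code (the loader below is a stand-in for the Parquet read):

    import time
    import numpy as np

    def expensive_load() -> np.ndarray:
        # Stand-in for reading the Parquet file and building the fps array
        return np.random.randint(0, 2**63, size=(1_000_000, 34), dtype=np.uint64)

    load_start = time.time()
    fps = expensive_load()
    elapsed = time.time() - load_start
    print(f"Loaded {fps.shape[0]} fingerprints in {elapsed:.2f}s")

For interval measurements like this, time.perf_counter() is generally a better fit than time.time(), since it is monotonic and unaffected by system clock adjustments.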