Skip to content

Commit 2458a59

Browse files
committed
new parquet to root converter
modified: .gitignore modified: etc/parquet2root.py
1 parent 8fd7863 commit 2458a59

File tree

2 files changed

+86
-14
lines changed

2 files changed

+86
-14
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
/target
22
/dist
3+
.venv
4+
*.root
5+
*.parquet

etc/parquet2root.py

Lines changed: 83 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,91 @@
1+
#!/usr/bin/env python3
2+
import os
3+
import sys
4+
import glob
15

2-
# This script demonstrates how to convert a Parquet file to a ROOT TTree using the `parquet_to_root` function
3-
# from the `parquet_to_root` module.
6+
import pyarrow.parquet as pq
7+
import uproot
8+
from tqdm import tqdm
9+
import numpy as np
410

5-
# pip install parquet-to-root
6-
# import ROOT must work for the script to run
11+
def parquet_to_root_stream(parquet_paths: list[str],
                           root_path: str,
                           tree_name: str,
                           batch_size: int = 100_000) -> None:
    """
    Stream one or more Parquet files into a single ROOT TTree without
    loading all data into memory. Shows progress with tqdm, including
    total elapsed time.

    Args:
        parquet_paths: Paths of the Parquet files to convert, processed in
            the order given.
        root_path: Path of the ROOT file to write. It is opened with
            ``uproot.recreate``.
        tree_name: Key under which the TTree is stored in the ROOT file.
        batch_size: Maximum number of rows requested per record batch.

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Note:
        The tree is created from the first record batch read; if no batch
        is read at all, no tree is written under ``tree_name``.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or
    # a meaningless progress total (negative values) further down.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # Expected number of batches, computed from the row counts in each
    # file's metadata (ceil division per file). This is only used as the
    # progress-bar total.
    # NOTE(review): the number of batches iter_batches actually yields may
    # differ from this estimate -- confirm against the pyarrow version used.
    total_batches = sum(
        (pq.read_metadata(path).num_rows + batch_size - 1) // batch_size
        for path in parquet_paths
    )

    # Custom bar format to show elapsed, remaining, rate, and total elapsed
    bar_format = (
        "{desc}: {percentage:3.0f}%|{bar}| "
        "{n}/{total} [{elapsed}<{remaining}, {rate_fmt}] "
        "[total: {elapsed}]"
    )

    with uproot.recreate(root_path) as root_file:
        first = True
        with tqdm(total=total_batches,
                  desc=f"Parquet → ROOT ({tree_name})",
                  bar_format=bar_format) as pbar:
            for path in parquet_paths:
                pf = pq.ParquetFile(path)
                for batch in pf.iter_batches(batch_size=batch_size):
                    # Convert RecordBatch to numpy arrays, one per column
                    data = {
                        col: batch.column(col).to_numpy(zero_copy_only=False)
                        for col in batch.schema.names
                    }

                    if first:
                        # The first assignment creates the tree; every
                        # later batch is appended to it.
                        root_file[tree_name] = data
                        first = False
                    else:
                        root_file[tree_name].extend(data)

                    pbar.update(1)

    # Print final file size (after the ROOT file has been closed)
    size_bytes = os.path.getsize(root_path)
    size_mb = size_bytes / (1024**2)
    print(f"Wrote streaming TTree '{tree_name}' to {root_path} ({size_mb:.2f} MB)")
2162

22-
print(f"Conversion complete in {time.time() - start_time:.2f}s: {parquet_file} -> {root_file} with TTree name '{tree_name}'")
63+
if __name__ == "__main__":
    # Command line: <input_glob> <output.root> <tree_name> [batch_size]
    # i.e. 3 or 4 arguments after the script name.
    if len(sys.argv) not in (4, 5):
        # Report the name the script was actually invoked as, rather than a
        # hard-coded file name that can drift from the real one.
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: {script_name} <input_glob> <output.root> <tree_name> [batch_size]")
        sys.exit(1)

    # Validate the optional batch size before doing any work so that a typo
    # produces a short message instead of a traceback.
    batch_size = 100_000
    if len(sys.argv) == 5:
        try:
            batch_size = int(sys.argv[4])
        except ValueError:
            print(f"Invalid batch_size (expected an integer): {sys.argv[4]}")
            sys.exit(1)
        if batch_size <= 0:
            print(f"Invalid batch_size (must be positive): {batch_size}")
            sys.exit(1)

    # Sort the matches so the files are converted in a deterministic order.
    parquet_glob = sys.argv[1]
    parquet_files = sorted(glob.glob(parquet_glob))
    if not parquet_files:
        print(f"No files match pattern: {parquet_glob}")
        sys.exit(1)

    print("Converting the following Parquet files:")
    for path in parquet_files:
        print(f"  {path}")

    root_file = sys.argv[2]
    tree_name = sys.argv[3]

    parquet_to_root_stream(
        parquet_paths=parquet_files,
        root_path=root_file,
        tree_name=tree_name,
        batch_size=batch_size,
    )
89+
90+
# Requires:
91+
# pip install pyarrow uproot tqdm numpy

0 commit comments

Comments
 (0)