1+ #!/usr/bin/env python3
2+ import os
3+ import sys
4+ import glob
15
2- # This script demonstrates how to convert a Parquet file to a ROOT TTree using the `parquet_to_root` function
3- # from the `parquet_to_root` module.
6+ import pyarrow .parquet as pq
7+ import uproot
8+ from tqdm import tqdm
9+ import numpy as np
410
5- # pip install parquet-to-root
6- # import ROOT must work for the script to run
def parquet_to_root_stream(parquet_paths: list[str],
                           root_path: str,
                           tree_name: str,
                           batch_size: int = 100_000) -> None:
    """
    Stream one or more Parquet files into a single ROOT TTree.

    Each file is read in record batches of at most ``batch_size`` rows, so
    only one batch is converted to NumPy and held at a time. The first batch
    creates the tree in the output file; every later batch is appended with
    ``extend``. Progress is reported with a tqdm bar counted in batches, and
    the size of the finished file is printed at the end.

    Args:
        parquet_paths: Paths of the Parquet files to read, in the order they
            should be written.
        root_path: Path of the ROOT file to create (passed to
            ``uproot.recreate``).
        tree_name: Key under which the TTree is stored in the ROOT file.
        batch_size: Maximum number of rows per batch; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # Reject bad batch sizes up front: 0 would otherwise fail later with a
    # ZeroDivisionError in the ceil-division below, and only after the output
    # path may already have been opened for writing.
    if batch_size <= 0:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )

    # Precompute the expected number of batches per file from the Parquet
    # metadata so the progress bar has a total before streaming starts.
    file_batches = []
    for path in parquet_paths:
        n_rows = pq.ParquetFile(path).metadata.num_rows
        n_batches = (n_rows + batch_size - 1) // batch_size  # ceil division
        file_batches.append((path, n_batches))
    total_batches = sum(nb for _, nb in file_batches)

    # Custom bar format to show elapsed, remaining, rate, and total elapsed
    bar_format = (
        "{desc}: {percentage:3.0f}%|{bar}| "
        "{n}/{total} [{elapsed}<{remaining}, {rate_fmt}] "
        "[total: {elapsed}]"
    )

    with uproot.recreate(root_path) as root_file:
        # The tree does not exist until the first batch is assigned, so the
        # first batch is written by assignment and the rest by extend().
        first = True
        with tqdm(total=total_batches,
                  desc=f"Parquet → ROOT ({tree_name})",
                  bar_format=bar_format) as pbar:
            for path, _ in file_batches:
                pf = pq.ParquetFile(path)
                for batch in pf.iter_batches(batch_size=batch_size):
                    # Convert the RecordBatch to one NumPy array per column.
                    # NOTE(review): later batches are assumed to carry the
                    # same columns as the first one - confirm for mixed inputs.
                    data = {
                        col: batch.column(col).to_numpy(zero_copy_only=False)
                        for col in batch.schema.names
                    }

                    if first:
                        root_file[tree_name] = data
                        first = False
                    else:
                        root_file[tree_name].extend(data)

                    pbar.update(1)

    # Report the final file size; read after the writer context has exited.
    size_bytes = os.path.getsize(root_path)
    size_mb = size_bytes / (1024 ** 2)
    print(f"Wrote streaming TTree '{tree_name}' to {root_path} ({size_mb:.2f} MB)")
2162
22- print (f"Conversion complete in { time .time () - start_time :.2f} s: { parquet_file } -> { root_file } with TTree name '{ tree_name } '" )
if __name__ == "__main__":
    # Command-line entry point.
    # Accepts 3 or 4 arguments after the script name:
    #   <input_glob> <output.root> <tree_name> [batch_size]
    if len(sys.argv) not in (4, 5):
        print("Usage: parquet_to_root.py <input_glob> <output.root> <tree_name> [batch_size]")
        sys.exit(1)

    parquet_glob = sys.argv[1]
    root_file = sys.argv[2]
    tree_name = sys.argv[3]

    # Validate the optional batch size before doing any work, so a typo
    # produces a short message instead of a traceback (non-numeric input)
    # or a ZeroDivisionError deep inside the conversion (zero).
    batch_size = 100_000
    if len(sys.argv) == 5:
        try:
            batch_size = int(sys.argv[4])
        except ValueError:
            print(f"batch_size must be an integer, got: {sys.argv[4]}")
            sys.exit(1)
        if batch_size <= 0:
            print(f"batch_size must be positive, got: {batch_size}")
            sys.exit(1)

    # Sort the matches so the input order is deterministic across runs.
    parquet_files = sorted(glob.glob(parquet_glob))
    if not parquet_files:
        print(f"No files match pattern: {parquet_glob}")
        sys.exit(1)

    print("Converting the following Parquet files:")
    for path in parquet_files:
        print(f" {path}")

    parquet_to_root_stream(
        parquet_paths=parquet_files,
        root_path=root_file,
        tree_name=tree_name,
        batch_size=batch_size,
    )
89+
90+ # Requires:
91+ # pip install pyarrow uproot tqdm numpy
0 commit comments