3030import concurrent .futures
3131import logging
3232import os
33- import tarfile
34- import tempfile
3533
3634import clickhouse_connect
3735import clickhouse_connect .driver .client
@@ -78,58 +76,26 @@ def download_file(url: str, output_path: str):
7876 logging .info (f"Downloaded { url } to { output_path } " )
7977
8078
81- def extract_package (temp_path : str , target_dir : str ):
82- with tarfile .open (temp_path , "r:gz" ) as tar :
83- safe_extract (tar , target_dir )
84-
85- def _is_within_directory (directory : str , target : str ) -> bool :
86- directory = os .path .abspath (directory )
87- target = os .path .abspath (target )
88- try :
89- common = os .path .commonpath ([directory , target ])
90- except Exception :
91- return False
92- return common == directory
93-
94-
95- def safe_extract (tar : tarfile .TarFile , path : str ) -> None :
96- os .makedirs (path , exist_ok = True )
97- for member in tar .getmembers ():
98- member_path = os .path .join (path , member .name )
99- if not _is_within_directory (path , member_path ):
100- raise RuntimeError (f"Unsafe tar member path: { member .name } " )
101- if member .issym () or member .islnk ():
102- logging .debug (f"Skipping link in archive: { member .name } " )
103- continue
104- tar .extract (member , path )
105-
def process_package(row: tuple[str, str], output_path_base: str):
    """Download a single package archive into ``output_path_base``.

    The archive is fetched from ``PYPI_PREFIX`` joined with ``row[1]`` and
    stored under the base name of ``row[1]``.  If a file with that name is
    already present the download is skipped.

    The download is written to a sibling ``<name>.part`` file and only moved
    to its final name once ``download_file`` has returned successfully.  That
    way an interrupted run can never leave a truncated archive that a later
    run would mistake for a completed download.

    Args:
        row: Two-element tuple; only ``row[1]`` is read here (presumably the
            archive's path relative to ``PYPI_PREFIX`` -- confirm against the
            query that produces the rows).
        output_path_base: Directory the archive is stored in.  It is not
            created here.

    Returns:
        None.  Failures are logged rather than raised, so one bad package
        does not abort the caller's batch.
    """
    download_path = f"{PYPI_PREFIX}/{row[1]}"
    file_name = os.path.basename(row[1])
    target_path = os.path.join(output_path_base, file_name)

    if os.path.exists(target_path):
        logging.info(f"Package {file_name} already exists, skipping")
        return

    # Same directory as the target so the final rename stays on one
    # filesystem and os.replace can swap it in atomically.
    partial_path = f"{target_path}.part"
    try:
        download_file(download_path, partial_path)
        os.replace(partial_path, target_path)
    except Exception as e:
        logging.error(f"Failed to process {file_name}: {e}")
        # Best-effort cleanup: a failure to delete the partial file must not
        # escape and mask the original error that was just logged.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            logging.warning(
                f"Could not remove partial file {partial_path}: {cleanup_error}"
            )
12995
13096def main ():
13197 parser = argparse .ArgumentParser (
132- description = "Download and extract recent PyPI packages."
98+ description = "Download recent PyPI packages."
13399 )
134100 parser .add_argument (
135101 "--days" ,
@@ -140,7 +106,7 @@ def main():
140106 parser .add_argument (
141107 "--output-path" ,
142108 default = "pypi_packages" ,
143- help = "Output directory path for extracted packages (default: pypi_packages)" ,
109+ help = "Output directory path for downloaded packages (default: pypi_packages)" ,
144110 )
145111 parser .add_argument (
146112 "--dry" ,
0 commit comments