Skip to content

Commit 533fb3c

Browse files
committed
Update download script
1 parent 6549628 commit 533fb3c

File tree

1 file changed

+8
-42
lines changed

1 file changed

+8
-42
lines changed

scripts/download_pypi.py

Lines changed: 8 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -30,8 +30,6 @@
3030
import concurrent.futures
3131
import logging
3232
import os
33-
import tarfile
34-
import tempfile
3533

3634
import clickhouse_connect
3735
import clickhouse_connect.driver.client
@@ -78,58 +76,26 @@ def download_file(url: str, output_path: str):
7876
logging.info(f"Downloaded {url} to {output_path}")
7977

8078

81-
def extract_package(temp_path: str, target_dir: str):
82-
with tarfile.open(temp_path, "r:gz") as tar:
83-
safe_extract(tar, target_dir)
84-
85-
def _is_within_directory(directory: str, target: str) -> bool:
86-
directory = os.path.abspath(directory)
87-
target = os.path.abspath(target)
88-
try:
89-
common = os.path.commonpath([directory, target])
90-
except Exception:
91-
return False
92-
return common == directory
93-
94-
95-
def safe_extract(tar: tarfile.TarFile, path: str) -> None:
96-
os.makedirs(path, exist_ok=True)
97-
for member in tar.getmembers():
98-
member_path = os.path.join(path, member.name)
99-
if not _is_within_directory(path, member_path):
100-
raise RuntimeError(f"Unsafe tar member path: {member.name}")
101-
if member.issym() or member.islnk():
102-
logging.debug(f"Skipping link in archive: {member.name}")
103-
continue
104-
tar.extract(member, path)
105-
10679
def process_package(row: tuple[str, str], output_path_base: str):
10780
download_path = f"{PYPI_PREFIX}/{row[1]}"
10881
file_name = os.path.basename(row[1])
109-
package_name = file_name[:-7] # remove .tar.gz
110-
target_dir = os.path.join(output_path_base, package_name)
82+
target_path = os.path.join(output_path_base, file_name)
11183

112-
if os.path.exists(target_dir):
113-
logging.info(f"Package {package_name} already exists, skipping")
84+
if os.path.exists(target_path):
85+
logging.info(f"Package {file_name} already exists, skipping")
11486
return
11587

116-
temp_path = None
11788
try:
118-
temp_fd, temp_path = tempfile.mkstemp(suffix=".tar.gz")
119-
os.close(temp_fd)
120-
download_file(download_path, temp_path)
121-
extract_package(temp_path, target_dir)
122-
os.remove(temp_path)
123-
logging.info(f"Unpacked {file_name} to {target_dir}")
89+
download_file(download_path, target_path)
12490
except Exception as e:
12591
logging.error(f"Failed to process {file_name}: {str(e)}")
126-
if temp_path and os.path.exists(temp_path):
127-
os.remove(temp_path)
92+
if os.path.exists(target_path):
93+
os.remove(target_path)
12894

12995

13096
def main():
13197
parser = argparse.ArgumentParser(
132-
description="Download and extract recent PyPI packages."
98+
description="Download recent PyPI packages."
13399
)
134100
parser.add_argument(
135101
"--days",
@@ -140,7 +106,7 @@ def main():
140106
parser.add_argument(
141107
"--output-path",
142108
default="pypi_packages",
143-
help="Output directory path for extracted packages (default: pypi_packages)",
109+
help="Output directory path for downloaded packages (default: pypi_packages)",
144110
)
145111
parser.add_argument(
146112
"--dry",

0 commit comments

Comments (0)