From c156e4c4eb2e25eba5f3c3bcece275422db44c1d Mon Sep 17 00:00:00 2001 From: jtschwar Date: Thu, 29 Jan 2026 13:35:21 -0800 Subject: [PATCH 1/4] add parser uri to readers, add function to download portal project for sta --- src/copick_utils/cli/download.py | 25 ++++ src/copick_utils/io/portal.py | 127 +++++++++++++++++++ src/copick_utils/io/readers.py | 210 ++++++++++++++++++++----------- src/copick_utils/io/writers.py | 18 +-- 4 files changed, 299 insertions(+), 81 deletions(-) create mode 100644 src/copick_utils/cli/download.py create mode 100644 src/copick_utils/io/portal.py diff --git a/src/copick_utils/cli/download.py b/src/copick_utils/cli/download.py new file mode 100644 index 0000000..f15ece4 --- /dev/null +++ b/src/copick_utils/cli/download.py @@ -0,0 +1,25 @@ +import click + +@click.command( + context_settings={"show_default": True}, + short_help="Download tilt series and alignments from the CryoET Data Portal.", + no_args_is_help=True, +) + +@click.option( + '-d', '--dataset', + required=True, type=str, + help='Dataset ID to download from the CryoET Data Portal.', +) +@click.option( + '-o', '--output', + required=True, default='.', type=str, + help='Output directory to save the downloaded files.', +) + +def download(dataset: str, output: str): + download_project(dataset, output) + +def download_project(dataset: str, output: str): + import copick_utils.io.portal as portal + portal.download_aretomo_files(dataset, output) \ No newline at end of file diff --git a/src/copick_utils/io/portal.py b/src/copick_utils/io/portal.py new file mode 100644 index 0000000..25c0840 --- /dev/null +++ b/src/copick_utils/io/portal.py @@ -0,0 +1,127 @@ +""" +A minimal example using minimal libraries / imports to download relevant AreTomo files from the CryoET Data Portal. Downloads the corresponding files, using the run ID as the base filename. +""" +import multiprocessing, requests, argparse, os +import pandas as pd +import numpy as np +import mdocfile, click + +import cryoet_data_portal as cdp +import s3fs + +global_client = cdp.Client() + +def download_aretomo_files(dataset_id: int, output_dir: str): + print(f"Fetching tiltseries for dataset id {dataset_id}...", flush=True) + tiltseries_list: list[cdp.TiltSeries] = [tiltseries for run in cdp.Dataset.get_by_id(global_client, dataset_id).runs for tiltseries in run.tiltseries] # a bit slow for some reason, can take some time + tiltseries_run_ids_and_ts_ids = [(ts.run.id, ts.id) for ts in tiltseries_list] + print(f"Found {len(tiltseries_run_ids_and_ts_ids)} tiltseries for dataset id {dataset_id}. Starting downloads...", flush=True) + with multiprocessing.Pool(processes=8) as pool: # adjust number of processes as needed + for _ in pool.imap_unordered(_worker_download_aretomo_files_for_tiltseries, [(dataset_id, run_name, output_dir, tiltseries_id) for run_name, tiltseries_id in tiltseries_run_ids_and_ts_ids]): + pass + +def _worker_download_aretomo_files_for_tiltseries(args): + dataset_id, run_name, output_dir, tiltseries_id = args + download_aretomo_files_for_tiltseries(dataset_id, run_name, output_dir, tiltseries_id) + +# note: this function assumes that there is only one tiltseries per run +# note: the tiltseries name is equivlaent to the run name +# if tiltseries_id is provided, will be prioritized over dataset_id + run_name +def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output_dir: str, tiltseries_id: int = None): + + print(f"[{run_name}] Downloading AreTomo files for tiltseries id {tiltseries_id}...", flush=True) + + client = cdp.Client() + s3 = s3fs.S3FileSystem(anon=True) + if not tiltseries_id: + all_tiltseries = cdp.TiltSeries.find(client, query_filters=[cdp.TiltSeries.run.dataset_id == dataset_id, cdp.TiltSeries.run.name == run_name]) + if len(all_tiltseries) == 0: + raise ValueError(f"No tiltseries found for dataset_id {dataset_id} and run_name {run_name}") + if len(all_tiltseries) > 1: + raise ValueError(f"Multiple tiltseries found for dataset_id {dataset_id} and run_name {run_name}") + tiltseries = all_tiltseries[0] + else: + tiltseries = cdp.TiltSeries.get_by_id(client, tiltseries_id) + + # get the s3 folder path and then glob for *.tlt / *.rawtlt files to download them, renaming the base to match the run id + s3_folder_path = tiltseries.s3_mrc_file.rsplit('/', 1)[0] + '/' + tlt_files = s3.glob(s3_folder_path + '*.tlt') + s3.glob(s3_folder_path + '*.rawtlt') + for tlt_file in tlt_files: + base_name = os.path.basename(tlt_file) + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}{ext}") + s3.get(tlt_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # do the same for "*CTF*.txt" files and "*ctf*.txt" files + ctf_files = s3.glob(s3_folder_path + '*CTF*.txt') + s3.glob(s3_folder_path + '*ctf*.txt') + if len(ctf_files) == 0: + print(f"WARNING: No CTF files found for tiltseries id {tiltseries.id}") + else: + ctf_file = ctf_files[0] + base_name = os.path.basename(ctf_file) + if len(ctf_files) > 1: + print(f"WARNING: Multiple CTF files found for tiltseries id {tiltseries.id}, using {base_name}") + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}_CTF.txt") + s3.get(ctf_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # now find the corresponding alignment for this tiltseries and download the "*.aln" file + if len(tiltseries.alignments) == 0: + print(f"WARNING: No alignments found for tiltseries id {tiltseries.id}") + elif len(tiltseries.alignments) > 1: + print(f"WARNING: Multiple alignments found for tiltseries id {tiltseries.id}") + else: + alignment = tiltseries.alignments[0] + s3_alignment_folder_path = alignment.s3_alignment_metadata.rsplit('/', 1)[0] + '/' + aln_files = s3.glob(s3_alignment_folder_path + '*.aln') + if len(aln_files) == 0: + raise ValueError(f"No .aln files found for run name {tiltseries.run.name} and alignment id {alignment.id}") + aln_file = aln_files[0] + base_name = os.path.basename(aln_file) + if len(aln_files) > 1: + print(f"WARNING: Multiple .aln files found for run name {tiltseries.run.name}, using {base_name}") + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}{ext}") + s3.get(aln_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # now get the mdoc file from the Frames/ folder + frames = tiltseries.run.frames + if len(frames) == 0: + raise ValueError(f"No frames found for run name {tiltseries.run.name}") + frame = frames[0] + s3_frames_folder_path = frame.s3_frame_path.rsplit('/', 1)[0] + '/' + mdoc_files = s3.glob(s3_frames_folder_path + '*.mdoc') + if len(mdoc_files) == 0: + raise ValueError(f"No .mdoc files found for run name {tiltseries.run.name}") + mdoc_file = mdoc_files[0] + base_name = os.path.basename(mdoc_file) + if len(mdoc_files) > 1: + print(f"WARNING: Multiple .mdoc files found for run name {tiltseries.run.name}, using {base_name}") + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}{ext}") + s3.get(mdoc_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # download tiltseries mrc file + tiltseries_file = os.path.join(output_dir, f"{tiltseries.run.id}.mrc") + tiltseries_url = tiltseries.https_mrc_file + response = requests.get(tiltseries_url, stream=True) + response.raise_for_status() + with open(tiltseries_file, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + print(f"[{tiltseries.run.id}] Downloaded tiltseries mrc file as {os.path.basename(tiltseries_file)}.", flush=True) + + # create imod file for order list + mdoc = mdocfile.read(os.path.join(output_dir, f"{tiltseries.run.id}.mdoc")) + order_list = mdoc['TiltAngle'] + imodpath = os.path.join(output_dir, f"{tiltseries.run.id}_Imod") + os.makedirs(imodpath, exist_ok=True) + number = np.arange(len(order_list)) + 1 + + # save in csv with 'ImageNumber', 'TiltAngle' headers + df = pd.DataFrame({'ImageNumber': number, 'TiltAngle': order_list}) + df.to_csv(os.path.join(imodpath, f"{tiltseries.run.id}_order_list.csv"), index=False) diff --git a/src/copick_utils/io/readers.py b/src/copick_utils/io/readers.py index 3de85f6..a7209e2 100644 --- a/src/copick_utils/io/readers.py +++ b/src/copick_utils/io/readers.py @@ -1,81 +1,131 @@ +from copick.util.uri import resolve_copick_objects import numpy as np def tomogram(run, voxel_size: float = 10, algorithm: str = "wbp", raise_error: bool = False): - voxel_spacing_obj = run.get_voxel_spacing(voxel_size) - - if voxel_spacing_obj is None: - # Query Avaiable Voxel Spacings - availableVoxelSpacings = [tomo.voxel_size for tomo in run.voxel_spacings] - - # Report to the user which voxel spacings they can use - message = ( - f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" - f"Available spacings are: {', '.join(map(str, availableVoxelSpacings))}" - ) - if raise_error: - raise ValueError(message) - else: - print(message) - return None - - tomogram = voxel_spacing_obj.get_tomogram(algorithm) - if tomogram is None: - # Get available algorithms - availableAlgorithms = [tomo.tomo_type for tomo in run.get_voxel_spacing(voxel_size).tomograms] - - # Report to the user which algorithms are available - message = ( - f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" - f"Available algorithms are: {', '.join(availableAlgorithms)}" - ) - if raise_error: - raise ValueError(message) - else: - print(message) - return None - - return tomogram.numpy() - - -def segmentation(run, voxel_spacing: float, segmentation_name: str, session_id=None, user_id=None, raise_error=False): - seg = run.get_segmentations( - name=segmentation_name, - session_id=session_id, - user_id=user_id, - voxel_size=voxel_spacing, - ) - - # No Segmentations Are Available, Result in Error - if len(seg) == 0: - # Get all available segmentations with their metadata - available_segs = run.get_segmentations(voxel_size=voxel_spacing) - seg_info = [(s.name, s.user_id, s.session_id) for s in available_segs] - - # Format the information for display - seg_details = [f"(name: {name}, user_id: {uid}, session_id: {sid})" for name, uid, sid in seg_info] - - message = ( - f"\nNo segmentation found matching:\n" - f" name: {segmentation_name}, user_id: {user_id}, session_id: {session_id}\n" - f"Available segmentations in {run.name} are:\n " + "\n ".join(seg_details) - ) - if raise_error: - raise ValueError(message) - else: - print(message) - return None - - # No Segmentations Are Available, Result in Error - if len(seg) > 1: - print( - f"[Warning] More Than 1 Segmentation is Available for the Query Information. " - f"Available Segmentations are: {seg} " - f"Defaulting to Loading: {seg[0]}\n", + """ + Reads a tomogram from a Copick run. + + Parameters: + ----------- + run: copick.Run + voxel_size: float + algorithm: str + raise_error: bool + + Returns: + -------- + vol: np.ndarray - The tomogram. + """ + + # Get the tomogram from the Copick URI + try: + uri = f'{algorithm}@{voxel_size}' + vol = resolve_copick_objects(uri, run.root, 'tomogram', run_name = run.name) + return vol[0].numpy() + except: # Report which orbject is missing + + # Try to resolve the tomogram using the Copick URI + voxel_spacing_obj = run.get_voxel_spacing(voxel_size) + + if voxel_spacing_obj is None: + # Query Avaiable Voxel Spacings + availableVoxelSpacings = [tomo.voxel_size for tomo in run.voxel_spacings] + + # Report to the user which voxel spacings they can use + message = ( + f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" + f"Available spacings are: {', '.join(map(str, availableVoxelSpacings))}" + ) + if raise_error: + raise ValueError(message) + else: + print(message) + return None + + tomogram = voxel_spacing_obj.get_tomogram(algorithm) + if tomogram is None: + # Get available algorithms + availableAlgorithms = [tomo.tomo_type for tomo in run.get_voxel_spacing(voxel_size).tomograms] + + # Report to the user which algorithms are available + message = ( + f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" + f"Available algorithms are: {', '.join(availableAlgorithms)}" + ) + if raise_error: + raise ValueError(message) + else: + print(message) + return None + + +def segmentation(run, voxel_spacing: float, name: str, user_id=None, session_id=None, raise_error=False): + """ + Reads a segmentation from a Copick run. + + Parameters: + ----------- + run: copick.Run + voxel_spacing: float + name: str + user_id: str + session_id: str + raise_error: bool + + Returns: + -------- + seg: np.ndarray - The segmentation. + """ + + # Fill in the missing values with wildcards + if user_id is None: user_id = '*' + if session_id is None: session_id = '*' + + # Try to resolve the segmentation using the Copick URI + try: + uri = f'{name}:{user_id}/{session_id}@{voxel_spacing}' + segs = resolve_copick_objects(uri, run.root, 'segmentation', run_name = run.name) + return segs[0].numpy() + except: + # If the query was unavailable, set the user_id and session_id to None + user_id, session_id = None, None + + # Query Was Unavailable, Let's List Out All Available Segmentations + seg = run.get_segmentations( + name=name, + session_id=session_id, + user_id=user_id, + voxel_size=voxel_spacing, ) - seg = seg[0] - return seg.numpy() + # No Segmentations Are Available, Result in Error + if len(seg) == 0: + # Get all available segmentations with their metadata + available_segs = run.get_segmentations(voxel_size=voxel_spacing) + seg_info = [(s.name, s.user_id, s.session_id) for s in available_segs] + + # Format the information for display + seg_details = [f"(name: {name}, user_id: {uid}, session_id: {sid})" for name, uid, sid in seg_info] + + message = ( + f"\nNo segmentation found matching:\n" + f" name: {name}, user_id: {user_id}, session_id: {session_id}\n" + f"Available segmentations in {run.name} are:\n " + "\n ".join(seg_details) + ) + if raise_error: + raise ValueError(message) + else: + print(message) + return None + + # No Segmentations Are Available, Result in Error + if len(seg) > 1: + print( + f"[Warning] More Than 1 Segmentation is Available for the Query Information. " + f"Available Segmentations are: {seg} " + f"Defaulting to Loading: {seg[0]}\n", + ) def coordinates( @@ -86,6 +136,22 @@ def coordinates( voxel_size: float = 10, # Voxel size of the tomogram, used for scaling the coordinates raise_error: bool = False, ): + """ + Reads the coordinates of the picks from a Copick run. + + Parameters: + ----------- + run: copick.Run + name: str + user_id: str + session_id: str + voxel_size: float + raise_error: bool + + Returns: + -------- + coordinates: np.ndarray - The 3D coordinates of the picks in voxel space. + """ # Retrieve the pick points associated with the specified object and user ID picks = run.get_picks(object_name=name, user_id=user_id, session_id=session_id) diff --git a/src/copick_utils/io/writers.py b/src/copick_utils/io/writers.py index 572525c..6c9877f 100644 --- a/src/copick_utils/io/writers.py +++ b/src/copick_utils/io/writers.py @@ -28,17 +28,17 @@ def tomogram(run, input_volume, voxel_size=10, algorithm="wbp"): voxel_spacing = run.new_voxel_spacing(voxel_size=voxel_size) # Check if We Need to Create a New Tomogram for Given Algorithm - tomogram = voxel_spacing.get_tomogram(algorithm) - if tomogram is None: - tomogram = voxel_spacing.new_tomogram(tomo_type=algorithm) + tomo = voxel_spacing.get_tomogram(algorithm) + if tomo is None: + tomo = voxel_spacing.new_tomogram(tomo_type=algorithm) # Write the tomogram data - tomogram.from_numpy(input_volume) + tomo.from_numpy(input_volume) def segmentation( run, - segmentation_volume, + seg_vol, user_id, name="segmentation", session_id="0", @@ -52,7 +52,7 @@ def segmentation( ----------- run : copick.Run The current Copick run object. - segmentation_volume : np.ndarray + seg_vol : np.ndarray The segmentation data to be written. user_id : str The ID of the user creating the segmentation. @@ -76,7 +76,7 @@ def segmentation( # If no segmentation exists or no segmentation at the given voxel size, create a new one if len(segmentations) == 0 or any(seg.voxel_size != voxel_size for seg in segmentations): - segmentation = run.new_segmentation( + seg = run.new_segmentation( voxel_size=voxel_size, name=name, session_id=session_id, @@ -85,7 +85,7 @@ def segmentation( ) else: # Overwrite the current segmentation at the specified voxel size if it exists - segmentation = next(seg for seg in segmentations if seg.voxel_size == voxel_size) + seg = next(seg for seg in segmentations if seg.voxel_size == voxel_size) # Write the segmentation data - segmentation.from_numpy(segmentation_volume, dtype=np.uint8) + seg.from_numpy(seg_vol, dtype=np.uint8) From 7a02e51e7c68de5b84a1acf5be22504188b87333 Mon Sep 17 00:00:00 2001 From: jtschwar Date: Fri, 30 Jan 2026 14:22:32 -0800 Subject: [PATCH 2/4] configure command to copick cli --- pyproject.toml | 4 ++++ src/copick_utils/cli/download.py | 7 ++++--- src/copick_utils/cli/download_commands.py | 11 +++++++++++ src/copick_utils/io/portal.py | 4 ++-- 4 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 src/copick_utils/cli/download_commands.py diff --git a/pyproject.toml b/pyproject.toml index 812fb86..8ea6305 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "trimesh", "manifold3d", "mapbox-earcut", + "mdocfile", "tqdm", "scikit-learn", "shapely", @@ -87,6 +88,9 @@ clippicks = "copick_utils.cli.logical_commands:clippicks" picksin = "copick_utils.cli.logical_commands:picksin" picksout = "copick_utils.cli.logical_commands:picksout" +[project.entry-points."copick.download.commands"] +project = "copick_utils.cli.download_commands:project" + [tool.hatch.version] path = "src/copick_utils/__init__.py" diff --git a/src/copick_utils/cli/download.py b/src/copick_utils/cli/download.py index f15ece4..3503762 100644 --- a/src/copick_utils/cli/download.py +++ b/src/copick_utils/cli/download.py @@ -5,7 +5,6 @@ short_help="Download tilt series and alignments from the CryoET Data Portal.", no_args_is_help=True, ) - @click.option( '-d', '--dataset', required=True, type=str, @@ -16,8 +15,10 @@ required=True, default='.', type=str, help='Output directory to save the downloaded files.', ) - -def download(dataset: str, output: str): +def project(dataset: str, output: str): + """ + Download tilt series and alignments from the CryoET Data Portal for sub-tomogram averaging with py2rely. + """ download_project(dataset, output) def download_project(dataset: str, output: str): diff --git a/src/copick_utils/cli/download_commands.py b/src/copick_utils/cli/download_commands.py new file mode 100644 index 0000000..d7c0792 --- /dev/null +++ b/src/copick_utils/cli/download_commands.py @@ -0,0 +1,11 @@ +"""CLI commands for downloading data from the CryoET Data Portal. + +This module imports all download commands from specialized files for better organization. +""" + +from copick_utils.cli.download import project + +# All commands are now available for import by the main CLI +__all__ = [ + "project", +] \ No newline at end of file diff --git a/src/copick_utils/io/portal.py b/src/copick_utils/io/portal.py index 25c0840..64f65e6 100644 --- a/src/copick_utils/io/portal.py +++ b/src/copick_utils/io/portal.py @@ -1,10 +1,10 @@ """ A minimal example using minimal libraries / imports to download relevant AreTomo files from the CryoET Data Portal. Downloads the corresponding files, using the run ID as the base filename. """ -import multiprocessing, requests, argparse, os +import multiprocessing, requests, os import pandas as pd import numpy as np -import mdocfile, click +import mdocfile import cryoet_data_portal as cdp import s3fs From 812de6c3749f825b30ac31e9eda981908b93f4da Mon Sep 17 00:00:00 2001 From: jtschwar Date: Fri, 30 Jan 2026 14:28:44 -0800 Subject: [PATCH 3/4] linting + acknowledgements --- src/copick_utils/cli/download.py | 22 ++++--- src/copick_utils/cli/download_commands.py | 2 +- src/copick_utils/io/portal.py | 72 +++++++++++++++-------- src/copick_utils/io/readers.py | 29 ++++----- 4 files changed, 78 insertions(+), 47 deletions(-) diff --git a/src/copick_utils/cli/download.py b/src/copick_utils/cli/download.py index 3503762..895e042 100644 --- a/src/copick_utils/cli/download.py +++ b/src/copick_utils/cli/download.py @@ -1,19 +1,25 @@ import click + @click.command( context_settings={"show_default": True}, short_help="Download tilt series and alignments from the CryoET Data Portal.", no_args_is_help=True, ) @click.option( - '-d', '--dataset', - required=True, type=str, - help='Dataset ID to download from the CryoET Data Portal.', + "-d", + "--dataset", + required=True, + type=str, + help="Dataset ID to download from the CryoET Data Portal.", ) @click.option( - '-o', '--output', - required=True, default='.', type=str, - help='Output directory to save the downloaded files.', + "-o", + "--output", + required=True, + default=".", + type=str, + help="Output directory to save the downloaded files.", ) def project(dataset: str, output: str): """ @@ -21,6 +27,8 @@ def project(dataset: str, output: str): """ download_project(dataset, output) + def download_project(dataset: str, output: str): import copick_utils.io.portal as portal - portal.download_aretomo_files(dataset, output) \ No newline at end of file + + portal.download_aretomo_files(dataset, output) diff --git a/src/copick_utils/cli/download_commands.py b/src/copick_utils/cli/download_commands.py index d7c0792..a97525f 100644 --- a/src/copick_utils/cli/download_commands.py +++ b/src/copick_utils/cli/download_commands.py @@ -8,4 +8,4 @@ # All commands are now available for import by the main CLI __all__ = [ "project", -] \ No newline at end of file +] diff --git a/src/copick_utils/io/portal.py b/src/copick_utils/io/portal.py index 64f65e6..8e96a37 100644 --- a/src/copick_utils/io/portal.py +++ b/src/copick_utils/io/portal.py @@ -1,40 +1,62 @@ """ -A minimal example using minimal libraries / imports to download relevant AreTomo files from the CryoET Data Portal. Downloads the corresponding files, using the run ID as the base filename. +A minimal example using minimal libraries / imports to download relevant AreTomo files +from the CryoET Data Portal. Downloads the corresponding files, using the run ID as the +base filename. + +Original implementation by Daniel Ji and Utz Ermel. """ -import multiprocessing, requests, os -import pandas as pd -import numpy as np -import mdocfile +import multiprocessing +import os import cryoet_data_portal as cdp +import mdocfile +import numpy as np +import pandas as pd +import requests import s3fs global_client = cdp.Client() + def download_aretomo_files(dataset_id: int, output_dir: str): print(f"Fetching tiltseries for dataset id {dataset_id}...", flush=True) - tiltseries_list: list[cdp.TiltSeries] = [tiltseries for run in cdp.Dataset.get_by_id(global_client, dataset_id).runs for tiltseries in run.tiltseries] # a bit slow for some reason, can take some time + tiltseries_list: list[cdp.TiltSeries] = [ + tiltseries for run in cdp.Dataset.get_by_id(global_client, dataset_id).runs for tiltseries in run.tiltseries + ] # a bit slow for some reason, can take some time tiltseries_run_ids_and_ts_ids = [(ts.run.id, ts.id) for ts in tiltseries_list] - print(f"Found {len(tiltseries_run_ids_and_ts_ids)} tiltseries for dataset id {dataset_id}. Starting downloads...", flush=True) - with multiprocessing.Pool(processes=8) as pool: # adjust number of processes as needed - for _ in pool.imap_unordered(_worker_download_aretomo_files_for_tiltseries, [(dataset_id, run_name, output_dir, tiltseries_id) for run_name, tiltseries_id in tiltseries_run_ids_and_ts_ids]): + print( + f"Found {len(tiltseries_run_ids_and_ts_ids)} tiltseries for dataset id {dataset_id}. Starting downloads...", + flush=True, + ) + with multiprocessing.Pool(processes=8) as pool: # adjust number of processes as needed + for _ in pool.imap_unordered( + _worker_download_aretomo_files_for_tiltseries, + [ + (dataset_id, run_name, output_dir, tiltseries_id) + for run_name, tiltseries_id in tiltseries_run_ids_and_ts_ids + ], + ): pass - + + def _worker_download_aretomo_files_for_tiltseries(args): dataset_id, run_name, output_dir, tiltseries_id = args download_aretomo_files_for_tiltseries(dataset_id, run_name, output_dir, tiltseries_id) + # note: this function assumes that there is only one tiltseries per run -# note: the tiltseries name is equivlaent to the run name +# note: the tiltseries name is equivlaent to the run name # if tiltseries_id is provided, will be prioritized over dataset_id + run_name def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output_dir: str, tiltseries_id: int = None): - print(f"[{run_name}] Downloading AreTomo files for tiltseries id {tiltseries_id}...", flush=True) client = cdp.Client() s3 = s3fs.S3FileSystem(anon=True) if not tiltseries_id: - all_tiltseries = cdp.TiltSeries.find(client, query_filters=[cdp.TiltSeries.run.dataset_id == dataset_id, cdp.TiltSeries.run.name == run_name]) + all_tiltseries = cdp.TiltSeries.find( + client, + query_filters=[cdp.TiltSeries.run.dataset_id == dataset_id, cdp.TiltSeries.run.name == run_name], + ) if len(all_tiltseries) == 0: raise ValueError(f"No tiltseries found for dataset_id {dataset_id} and run_name {run_name}") if len(all_tiltseries) > 1: @@ -44,8 +66,8 @@ def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output tiltseries = cdp.TiltSeries.get_by_id(client, tiltseries_id) # get the s3 folder path and then glob for *.tlt / *.rawtlt files to download them, renaming the base to match the run id - s3_folder_path = tiltseries.s3_mrc_file.rsplit('/', 1)[0] + '/' - tlt_files = s3.glob(s3_folder_path + '*.tlt') + s3.glob(s3_folder_path + '*.rawtlt') + s3_folder_path = tiltseries.s3_mrc_file.rsplit("/", 1)[0] + "/" + tlt_files = s3.glob(s3_folder_path + "*.tlt") + s3.glob(s3_folder_path + "*.rawtlt") for tlt_file in tlt_files: base_name = os.path.basename(tlt_file) ext = os.path.splitext(base_name)[1] @@ -54,7 +76,7 @@ def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) # do the same for "*CTF*.txt" files and "*ctf*.txt" files - ctf_files = s3.glob(s3_folder_path + '*CTF*.txt') + s3.glob(s3_folder_path + '*ctf*.txt') + ctf_files = s3.glob(s3_folder_path + "*CTF*.txt") + s3.glob(s3_folder_path + "*ctf*.txt") if len(ctf_files) == 0: print(f"WARNING: No CTF files found for tiltseries id {tiltseries.id}") else: @@ -63,7 +85,7 @@ def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output if len(ctf_files) > 1: print(f"WARNING: Multiple CTF files found for tiltseries id {tiltseries.id}, using {base_name}") ext = os.path.splitext(base_name)[1] - dest_file = os.path.join(output_dir, f"{tiltseries.run.id}_CTF.txt") + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}_CTF.txt") s3.get(ctf_file, dest_file) print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) @@ -74,8 +96,8 @@ def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output print(f"WARNING: Multiple alignments found for tiltseries id {tiltseries.id}") else: alignment = tiltseries.alignments[0] - s3_alignment_folder_path = alignment.s3_alignment_metadata.rsplit('/', 1)[0] + '/' - aln_files = s3.glob(s3_alignment_folder_path + '*.aln') + s3_alignment_folder_path = alignment.s3_alignment_metadata.rsplit("/", 1)[0] + "/" + aln_files = s3.glob(s3_alignment_folder_path + "*.aln") if len(aln_files) == 0: raise ValueError(f"No .aln files found for run name {tiltseries.run.name} and alignment id {alignment.id}") aln_file = aln_files[0] @@ -92,8 +114,8 @@ def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output if len(frames) == 0: raise ValueError(f"No frames found for run name {tiltseries.run.name}") frame = frames[0] - s3_frames_folder_path = frame.s3_frame_path.rsplit('/', 1)[0] + '/' - mdoc_files = s3.glob(s3_frames_folder_path + '*.mdoc') + s3_frames_folder_path = frame.s3_frame_path.rsplit("/", 1)[0] + "/" + mdoc_files = s3.glob(s3_frames_folder_path + "*.mdoc") if len(mdoc_files) == 0: raise ValueError(f"No .mdoc files found for run name {tiltseries.run.name}") mdoc_file = mdoc_files[0] @@ -110,18 +132,18 @@ def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output tiltseries_url = tiltseries.https_mrc_file response = requests.get(tiltseries_url, stream=True) response.raise_for_status() - with open(tiltseries_file, 'wb') as f: + with open(tiltseries_file, "wb") as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) print(f"[{tiltseries.run.id}] Downloaded tiltseries mrc file as {os.path.basename(tiltseries_file)}.", flush=True) # create imod file for order list mdoc = mdocfile.read(os.path.join(output_dir, f"{tiltseries.run.id}.mdoc")) - order_list = mdoc['TiltAngle'] + order_list = mdoc["TiltAngle"] imodpath = os.path.join(output_dir, f"{tiltseries.run.id}_Imod") os.makedirs(imodpath, exist_ok=True) number = np.arange(len(order_list)) + 1 - + # save in csv with 'ImageNumber', 'TiltAngle' headers - df = pd.DataFrame({'ImageNumber': number, 'TiltAngle': order_list}) + df = pd.DataFrame({"ImageNumber": number, "TiltAngle": order_list}) df.to_csv(os.path.join(imodpath, f"{tiltseries.run.id}_order_list.csv"), index=False) diff --git a/src/copick_utils/io/readers.py b/src/copick_utils/io/readers.py index a7209e2..b24efe7 100644 --- a/src/copick_utils/io/readers.py +++ b/src/copick_utils/io/readers.py @@ -1,5 +1,5 @@ -from copick.util.uri import resolve_copick_objects import numpy as np +from copick.util.uri import resolve_copick_objects def tomogram(run, voxel_size: float = 10, algorithm: str = "wbp", raise_error: bool = False): @@ -20,11 +20,10 @@ def tomogram(run, voxel_size: float = 10, algorithm: str = "wbp", raise_error: b # Get the tomogram from the Copick URI try: - uri = f'{algorithm}@{voxel_size}' - vol = resolve_copick_objects(uri, run.root, 'tomogram', run_name = run.name) + uri = f"{algorithm}@{voxel_size}" + vol = resolve_copick_objects(uri, run.root, "tomogram", run_name=run.name) return vol[0].numpy() - except: # Report which orbject is missing - + except Exception as err: # Report which orbject is missing # Try to resolve the tomogram using the Copick URI voxel_spacing_obj = run.get_voxel_spacing(voxel_size) @@ -38,7 +37,7 @@ def tomogram(run, voxel_size: float = 10, algorithm: str = "wbp", raise_error: b f"Available spacings are: {', '.join(map(str, availableVoxelSpacings))}" ) if raise_error: - raise ValueError(message) + raise ValueError(message) from err else: print(message) return None @@ -54,13 +53,13 @@ def tomogram(run, voxel_size: float = 10, algorithm: str = "wbp", raise_error: b f"Available algorithms are: {', '.join(availableAlgorithms)}" ) if raise_error: - raise ValueError(message) + raise ValueError(message) from err else: print(message) return None -def segmentation(run, voxel_spacing: float, name: str, user_id=None, session_id=None, raise_error=False): +def segmentation(run, voxel_spacing: float, name: str, user_id=None, session_id=None, raise_error=False): """ Reads a segmentation from a Copick run. @@ -79,15 +78,17 @@ def segmentation(run, voxel_spacing: float, name: str, user_id=None, session_id """ # Fill in the missing values with wildcards - if user_id is None: user_id = '*' - if session_id is None: session_id = '*' + if user_id is None: + user_id = "*" + if session_id is None: + session_id = "*" # Try to resolve the segmentation using the Copick URI try: - uri = f'{name}:{user_id}/{session_id}@{voxel_spacing}' - segs = resolve_copick_objects(uri, run.root, 'segmentation', run_name = run.name) + uri = f"{name}:{user_id}/{session_id}@{voxel_spacing}" + segs = resolve_copick_objects(uri, run.root, "segmentation", run_name=run.name) return segs[0].numpy() - except: + except Exception as err: # If the query was unavailable, set the user_id and session_id to None user_id, session_id = None, None @@ -114,7 +115,7 @@ def segmentation(run, voxel_spacing: float, name: str, user_id=None, session_id f"Available segmentations in {run.name} are:\n " + "\n ".join(seg_details) ) if raise_error: - raise ValueError(message) + raise ValueError(message) from err else: print(message) return None From bd0dd640beb903c15005273b6e55a094f45722a2 Mon Sep 17 00:00:00 2001 From: jtschwar Date: Fri, 30 Jan 2026 15:06:53 -0800 Subject: [PATCH 4/4] update flag --- src/copick_utils/cli/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/copick_utils/cli/download.py b/src/copick_utils/cli/download.py index 895e042..473fe2a 100644 --- a/src/copick_utils/cli/download.py +++ b/src/copick_utils/cli/download.py @@ -7,7 +7,7 @@ no_args_is_help=True, ) @click.option( - "-d", + "-ds", "--dataset", required=True, type=str,