diff --git a/pyproject.toml b/pyproject.toml index 812fb86..8ea6305 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "trimesh", "manifold3d", "mapbox-earcut", + "mdocfile", "tqdm", "scikit-learn", "shapely", @@ -87,6 +88,9 @@ clippicks = "copick_utils.cli.logical_commands:clippicks" picksin = "copick_utils.cli.logical_commands:picksin" picksout = "copick_utils.cli.logical_commands:picksout" +[project.entry-points."copick.download.commands"] +project = "copick_utils.cli.download_commands:project" + [tool.hatch.version] path = "src/copick_utils/__init__.py" diff --git a/src/copick_utils/cli/download.py b/src/copick_utils/cli/download.py new file mode 100644 index 0000000..473fe2a --- /dev/null +++ b/src/copick_utils/cli/download.py @@ -0,0 +1,34 @@ +import click + + +@click.command( + context_settings={"show_default": True}, + short_help="Download tilt series and alignments from the CryoET Data Portal.", + no_args_is_help=True, +) +@click.option( + "-ds", + "--dataset", + required=True, + type=str, + help="Dataset ID to download from the CryoET Data Portal.", +) +@click.option( + "-o", + "--output", + required=True, + default=".", + type=str, + help="Output directory to save the downloaded files.", +) +def project(dataset: str, output: str): + """ + Download tilt series and alignments from the CryoET Data Portal for sub-tomogram averaging with py2rely. + """ + download_project(dataset, output) + + +def download_project(dataset: str, output: str): + import copick_utils.io.portal as portal + + portal.download_aretomo_files(dataset, output) diff --git a/src/copick_utils/cli/download_commands.py b/src/copick_utils/cli/download_commands.py new file mode 100644 index 0000000..a97525f --- /dev/null +++ b/src/copick_utils/cli/download_commands.py @@ -0,0 +1,11 @@ +"""CLI commands for downloading data from the CryoET Data Portal. + +This module imports all download commands from specialized files for better organization. +""" + +from copick_utils.cli.download import project + +# All commands are now available for import by the main CLI +__all__ = [ + "project", +] diff --git a/src/copick_utils/io/portal.py b/src/copick_utils/io/portal.py new file mode 100644 index 0000000..8e96a37 --- /dev/null +++ b/src/copick_utils/io/portal.py @@ -0,0 +1,149 @@ +""" +A minimal example using minimal libraries / imports to download relevant AreTomo files +from the CryoET Data Portal. Downloads the corresponding files, using the run ID as the +base filename. + +Original implementation by Daniel Ji and Utz Ermel. +""" +import multiprocessing +import os + +import cryoet_data_portal as cdp +import mdocfile +import numpy as np +import pandas as pd +import requests +import s3fs + +global_client = cdp.Client() + + +def download_aretomo_files(dataset_id: int, output_dir: str): + print(f"Fetching tiltseries for dataset id {dataset_id}...", flush=True) + tiltseries_list: list[cdp.TiltSeries] = [ + tiltseries for run in cdp.Dataset.get_by_id(global_client, dataset_id).runs for tiltseries in run.tiltseries + ] # a bit slow for some reason, can take some time + tiltseries_run_ids_and_ts_ids = [(ts.run.id, ts.id) for ts in tiltseries_list] + print( + f"Found {len(tiltseries_run_ids_and_ts_ids)} tiltseries for dataset id {dataset_id}. Starting downloads...", + flush=True, + ) + with multiprocessing.Pool(processes=8) as pool: # adjust number of processes as needed + for _ in pool.imap_unordered( + _worker_download_aretomo_files_for_tiltseries, + [ + (dataset_id, run_name, output_dir, tiltseries_id) + for run_name, tiltseries_id in tiltseries_run_ids_and_ts_ids + ], + ): + pass + + +def _worker_download_aretomo_files_for_tiltseries(args): + dataset_id, run_name, output_dir, tiltseries_id = args + download_aretomo_files_for_tiltseries(dataset_id, run_name, output_dir, tiltseries_id) + + +# note: this function assumes that there is only one tiltseries per run +# note: the tiltseries name is equivlaent to the run name +# if tiltseries_id is provided, will be prioritized over dataset_id + run_name +def download_aretomo_files_for_tiltseries(dataset_id: int, run_name: str, output_dir: str, tiltseries_id: int = None): + print(f"[{run_name}] Downloading AreTomo files for tiltseries id {tiltseries_id}...", flush=True) + + client = cdp.Client() + s3 = s3fs.S3FileSystem(anon=True) + if not tiltseries_id: + all_tiltseries = cdp.TiltSeries.find( + client, + query_filters=[cdp.TiltSeries.run.dataset_id == dataset_id, cdp.TiltSeries.run.name == run_name], + ) + if len(all_tiltseries) == 0: + raise ValueError(f"No tiltseries found for dataset_id {dataset_id} and run_name {run_name}") + if len(all_tiltseries) > 1: + raise ValueError(f"Multiple tiltseries found for dataset_id {dataset_id} and run_name {run_name}") + tiltseries = all_tiltseries[0] + else: + tiltseries = cdp.TiltSeries.get_by_id(client, tiltseries_id) + + # get the s3 folder path and then glob for *.tlt / *.rawtlt files to download them, renaming the base to match the run id + s3_folder_path = tiltseries.s3_mrc_file.rsplit("/", 1)[0] + "/" + tlt_files = s3.glob(s3_folder_path + "*.tlt") + s3.glob(s3_folder_path + "*.rawtlt") + for tlt_file in tlt_files: + base_name = os.path.basename(tlt_file) + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}{ext}") + s3.get(tlt_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # do the same for "*CTF*.txt" files and "*ctf*.txt" files + ctf_files = s3.glob(s3_folder_path + "*CTF*.txt") + s3.glob(s3_folder_path + "*ctf*.txt") + if len(ctf_files) == 0: + print(f"WARNING: No CTF files found for tiltseries id {tiltseries.id}") + else: + ctf_file = ctf_files[0] + base_name = os.path.basename(ctf_file) + if len(ctf_files) > 1: + print(f"WARNING: Multiple CTF files found for tiltseries id {tiltseries.id}, using {base_name}") + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}_CTF.txt") + s3.get(ctf_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # now find the corresponding alignment for this tiltseries and download the "*.aln" file + if len(tiltseries.alignments) == 0: + print(f"WARNING: No alignments found for tiltseries id {tiltseries.id}") + elif len(tiltseries.alignments) > 1: + print(f"WARNING: Multiple alignments found for tiltseries id {tiltseries.id}") + else: + alignment = tiltseries.alignments[0] + s3_alignment_folder_path = alignment.s3_alignment_metadata.rsplit("/", 1)[0] + "/" + aln_files = s3.glob(s3_alignment_folder_path + "*.aln") + if len(aln_files) == 0: + raise ValueError(f"No .aln files found for run name {tiltseries.run.name} and alignment id {alignment.id}") + aln_file = aln_files[0] + base_name = os.path.basename(aln_file) + if len(aln_files) > 1: + print(f"WARNING: Multiple .aln files found for run name {tiltseries.run.name}, using {base_name}") + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}{ext}") + s3.get(aln_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # now get the mdoc file from the Frames/ folder + frames = tiltseries.run.frames + if len(frames) == 0: + raise ValueError(f"No frames found for run name {tiltseries.run.name}") + frame = frames[0] + s3_frames_folder_path = frame.s3_frame_path.rsplit("/", 1)[0] + "/" + mdoc_files = s3.glob(s3_frames_folder_path + "*.mdoc") + if len(mdoc_files) == 0: + raise ValueError(f"No .mdoc files found for run name {tiltseries.run.name}") + mdoc_file = mdoc_files[0] + base_name = os.path.basename(mdoc_file) + if len(mdoc_files) > 1: + print(f"WARNING: Multiple .mdoc files found for run name {tiltseries.run.name}, using {base_name}") + ext = os.path.splitext(base_name)[1] + dest_file = os.path.join(output_dir, f"{tiltseries.run.id}{ext}") + s3.get(mdoc_file, dest_file) + print(f"[{tiltseries.run.id}] Downloaded {base_name} as {os.path.basename(dest_file)}.", flush=True) + + # download tiltseries mrc file + tiltseries_file = os.path.join(output_dir, f"{tiltseries.run.id}.mrc") + tiltseries_url = tiltseries.https_mrc_file + response = requests.get(tiltseries_url, stream=True) + response.raise_for_status() + with open(tiltseries_file, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + print(f"[{tiltseries.run.id}] Downloaded tiltseries mrc file as {os.path.basename(tiltseries_file)}.", flush=True) + + # create imod file for order list + mdoc = mdocfile.read(os.path.join(output_dir, f"{tiltseries.run.id}.mdoc")) + order_list = mdoc["TiltAngle"] + imodpath = os.path.join(output_dir, f"{tiltseries.run.id}_Imod") + os.makedirs(imodpath, exist_ok=True) + number = np.arange(len(order_list)) + 1 + + # save in csv with 'ImageNumber', 'TiltAngle' headers + df = pd.DataFrame({"ImageNumber": number, "TiltAngle": order_list}) + df.to_csv(os.path.join(imodpath, f"{tiltseries.run.id}_order_list.csv"), index=False) diff --git a/src/copick_utils/io/readers.py b/src/copick_utils/io/readers.py index 3de85f6..b24efe7 100644 --- a/src/copick_utils/io/readers.py +++ b/src/copick_utils/io/readers.py @@ -1,81 +1,132 @@ import numpy as np +from copick.util.uri import resolve_copick_objects def tomogram(run, voxel_size: float = 10, algorithm: str = "wbp", raise_error: bool = False): - voxel_spacing_obj = run.get_voxel_spacing(voxel_size) - - if voxel_spacing_obj is None: - # Query Avaiable Voxel Spacings - availableVoxelSpacings = [tomo.voxel_size for tomo in run.voxel_spacings] - - # Report to the user which voxel spacings they can use - message = ( - f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" - f"Available spacings are: {', '.join(map(str, availableVoxelSpacings))}" - ) - if raise_error: - raise ValueError(message) - else: - print(message) - return None - - tomogram = voxel_spacing_obj.get_tomogram(algorithm) - if tomogram is None: - # Get available algorithms - availableAlgorithms = [tomo.tomo_type for tomo in run.get_voxel_spacing(voxel_size).tomograms] - - # Report to the user which algorithms are available - message = ( - f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" - f"Available algorithms are: {', '.join(availableAlgorithms)}" - ) - if raise_error: - raise ValueError(message) - else: - print(message) - return None - - return tomogram.numpy() - - -def segmentation(run, voxel_spacing: float, segmentation_name: str, session_id=None, user_id=None, raise_error=False): - seg = run.get_segmentations( - name=segmentation_name, - session_id=session_id, - user_id=user_id, - voxel_size=voxel_spacing, - ) - - # No Segmentations Are Available, Result in Error - if len(seg) == 0: - # Get all available segmentations with their metadata - available_segs = run.get_segmentations(voxel_size=voxel_spacing) - seg_info = [(s.name, s.user_id, s.session_id) for s in available_segs] - - # Format the information for display - seg_details = [f"(name: {name}, user_id: {uid}, session_id: {sid})" for name, uid, sid in seg_info] - - message = ( - f"\nNo segmentation found matching:\n" - f" name: {segmentation_name}, user_id: {user_id}, session_id: {session_id}\n" - f"Available segmentations in {run.name} are:\n " + "\n ".join(seg_details) - ) - if raise_error: - raise ValueError(message) - else: - print(message) - return None - - # No Segmentations Are Available, Result in Error - if len(seg) > 1: - print( - f"[Warning] More Than 1 Segmentation is Available for the Query Information. " - f"Available Segmentations are: {seg} " - f"Defaulting to Loading: {seg[0]}\n", + """ + Reads a tomogram from a Copick run. + + Parameters: + ----------- + run: copick.Run + voxel_size: float + algorithm: str + raise_error: bool + + Returns: + -------- + vol: np.ndarray - The tomogram. + """ + + # Get the tomogram from the Copick URI + try: + uri = f"{algorithm}@{voxel_size}" + vol = resolve_copick_objects(uri, run.root, "tomogram", run_name=run.name) + return vol[0].numpy() + except Exception as err: # Report which orbject is missing + # Try to resolve the tomogram using the Copick URI + voxel_spacing_obj = run.get_voxel_spacing(voxel_size) + + if voxel_spacing_obj is None: + # Query Avaiable Voxel Spacings + availableVoxelSpacings = [tomo.voxel_size for tomo in run.voxel_spacings] + + # Report to the user which voxel spacings they can use + message = ( + f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" + f"Available spacings are: {', '.join(map(str, availableVoxelSpacings))}" + ) + if raise_error: + raise ValueError(message) from err + else: + print(message) + return None + + tomogram = voxel_spacing_obj.get_tomogram(algorithm) + if tomogram is None: + # Get available algorithms + availableAlgorithms = [tomo.tomo_type for tomo in run.get_voxel_spacing(voxel_size).tomograms] + + # Report to the user which algorithms are available + message = ( + f"[Warning] No tomogram found for {run.name} with voxel size {voxel_size} and tomogram type {algorithm}" + f"Available algorithms are: {', '.join(availableAlgorithms)}" + ) + if raise_error: + raise ValueError(message) from err + else: + print(message) + return None + + +def segmentation(run, voxel_spacing: float, name: str, user_id=None, session_id=None, raise_error=False): + """ + Reads a segmentation from a Copick run. + + Parameters: + ----------- + run: copick.Run + voxel_spacing: float + name: str + user_id: str + session_id: str + raise_error: bool + + Returns: + -------- + seg: np.ndarray - The segmentation. + """ + + # Fill in the missing values with wildcards + if user_id is None: + user_id = "*" + if session_id is None: + session_id = "*" + + # Try to resolve the segmentation using the Copick URI + try: + uri = f"{name}:{user_id}/{session_id}@{voxel_spacing}" + segs = resolve_copick_objects(uri, run.root, "segmentation", run_name=run.name) + return segs[0].numpy() + except Exception as err: + # If the query was unavailable, set the user_id and session_id to None + user_id, session_id = None, None + + # Query Was Unavailable, Let's List Out All Available Segmentations + seg = run.get_segmentations( + name=name, + session_id=session_id, + user_id=user_id, + voxel_size=voxel_spacing, ) - seg = seg[0] - return seg.numpy() + # No Segmentations Are Available, Result in Error + if len(seg) == 0: + # Get all available segmentations with their metadata + available_segs = run.get_segmentations(voxel_size=voxel_spacing) + seg_info = [(s.name, s.user_id, s.session_id) for s in available_segs] + + # Format the information for display + seg_details = [f"(name: {name}, user_id: {uid}, session_id: {sid})" for name, uid, sid in seg_info] + + message = ( + f"\nNo segmentation found matching:\n" + f" name: {name}, user_id: {user_id}, session_id: {session_id}\n" + f"Available segmentations in {run.name} are:\n " + "\n ".join(seg_details) + ) + if raise_error: + raise ValueError(message) from err + else: + print(message) + return None + + # No Segmentations Are Available, Result in Error + if len(seg) > 1: + print( + f"[Warning] More Than 1 Segmentation is Available for the Query Information. " + f"Available Segmentations are: {seg} " + f"Defaulting to Loading: {seg[0]}\n", + ) def coordinates( @@ -86,6 +137,22 @@ def coordinates( voxel_size: float = 10, # Voxel size of the tomogram, used for scaling the coordinates raise_error: bool = False, ): + """ + Reads the coordinates of the picks from a Copick run. + + Parameters: + ----------- + run: copick.Run + name: str + user_id: str + session_id: str + voxel_size: float + raise_error: bool + + Returns: + -------- + coordinates: np.ndarray - The 3D coordinates of the picks in voxel space. + """ # Retrieve the pick points associated with the specified object and user ID picks = run.get_picks(object_name=name, user_id=user_id, session_id=session_id) diff --git a/src/copick_utils/io/writers.py b/src/copick_utils/io/writers.py index 572525c..6c9877f 100644 --- a/src/copick_utils/io/writers.py +++ b/src/copick_utils/io/writers.py @@ -28,17 +28,17 @@ def tomogram(run, input_volume, voxel_size=10, algorithm="wbp"): voxel_spacing = run.new_voxel_spacing(voxel_size=voxel_size) # Check if We Need to Create a New Tomogram for Given Algorithm - tomogram = voxel_spacing.get_tomogram(algorithm) - if tomogram is None: - tomogram = voxel_spacing.new_tomogram(tomo_type=algorithm) + tomo = voxel_spacing.get_tomogram(algorithm) + if tomo is None: + tomo = voxel_spacing.new_tomogram(tomo_type=algorithm) # Write the tomogram data - tomogram.from_numpy(input_volume) + tomo.from_numpy(input_volume) def segmentation( run, - segmentation_volume, + seg_vol, user_id, name="segmentation", session_id="0", @@ -52,7 +52,7 @@ def segmentation( ----------- run : copick.Run The current Copick run object. - segmentation_volume : np.ndarray + seg_vol : np.ndarray The segmentation data to be written. user_id : str The ID of the user creating the segmentation. @@ -76,7 +76,7 @@ def segmentation( # If no segmentation exists or no segmentation at the given voxel size, create a new one if len(segmentations) == 0 or any(seg.voxel_size != voxel_size for seg in segmentations): - segmentation = run.new_segmentation( + seg = run.new_segmentation( voxel_size=voxel_size, name=name, session_id=session_id, @@ -85,7 +85,7 @@ def segmentation( ) else: # Overwrite the current segmentation at the specified voxel size if it exists - segmentation = next(seg for seg in segmentations if seg.voxel_size == voxel_size) + seg = next(seg for seg in segmentations if seg.voxel_size == voxel_size) # Write the segmentation data - segmentation.from_numpy(segmentation_volume, dtype=np.uint8) + seg.from_numpy(seg_vol, dtype=np.uint8)