From 31e4c9a599f3e41770ae5c54a8a3653cda737eca Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:20:25 -0800 Subject: [PATCH 01/35] Updating ALCF endpoints to include the synaps-i allocation (to be set up) --- config.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/config.yml b/config.yml index 3f26a4f0..0d24832a 100644 --- a/config.yml +++ b/config.yml @@ -72,17 +72,23 @@ globus: uuid: 75b478b2-37af-46df-bfbd-71ed692c6506 name: data832_scratch - alcf832_raw: + alcf832_synaps: + root_path: / + uri: alcf.anl.gov + uuid: TBD + name: alcf832_synaps + + alcf832_iri_raw: root_path: /data/raw uri: alcf.anl.gov uuid: 55c3adf6-31f1-4647-9a38-52591642f7e7 - name: alcf_raw + name: alcf_iri_raw - alcf832_scratch: + alcf832_iri_scratch: root_path: /data/scratch uri: alcf.anl.gov uuid: 55c3adf6-31f1-4647-9a38-52591642f7e7 - name: alcf_scratch + name: alcf_iri_scratch alcf_eagle832: root_path: /IRIBeta/als/example From bf712a4f11af3717daf3b3c9722923e64ecb012d Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:21:44 -0800 Subject: [PATCH 02/35] Updating bl832 config.py to distinguish IRI and SYNAPS-I ALCF endpoints --- orchestration/flows/bl832/config.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 788eef4a..7294b0a7 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -24,7 +24,8 @@ def _beam_specific_config(self) -> None: self.nersc832_alsdev_pscratch_raw = self.endpoints["nersc832_alsdev_pscratch_raw"] self.nersc832_alsdev_pscratch_scratch = self.endpoints["nersc832_alsdev_pscratch_scratch"] self.nersc832_alsdev_recon_scripts = self.endpoints["nersc832_alsdev_recon_scripts"] - self.alcf832_raw = self.endpoints["alcf832_raw"] - self.alcf832_scratch = self.endpoints["alcf832_scratch"] - self.scicat = self.config["scicat"] - self.ghcr_images832 = self.config["ghcr_images832"] + self.alcf832_synaps = self.endpoints["alcf832_synaps"] + self.alcf832_iri_raw = self.endpoints["alcf832_iri_raw"] + self.alcf832_iri_scratch = self.endpoints["alcf832_iri_scratch"] + self.scicat = config["scicat"] + self.ghcr_images832 = config["ghcr_images832"] From 77d6bc8a5187bc45b3d63a17569c61d15f934318 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:22:11 -0800 Subject: [PATCH 03/35] Adding the config.yaml file for setting up the globus compute endpoint for reconstruction on ALCF --- .../polaris/globus_compute_recon_config.yaml | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 scripts/polaris/globus_compute_recon_config.yaml diff --git a/scripts/polaris/globus_compute_recon_config.yaml b/scripts/polaris/globus_compute_recon_config.yaml new file mode 100644 index 00000000..66ffd331 --- /dev/null +++ b/scripts/polaris/globus_compute_recon_config.yaml @@ -0,0 +1,39 @@ +engine: + type: GlobusComputeEngine # This engine uses the HighThroughputExecutor + max_retries_on_system_failure: 2 + max_workers: 1 # Sets one worker per node + prefetch_capacity: 0 # Increase if you have many more tasks than workers + + address: + type: address_by_interface + ifname: bond0 + + strategy: simple + job_status_kwargs: + max_idletime: 300 + strategy_period: 60 + + provider: + type: PBSProProvider + + launcher: + type: MpiExecLauncher + # Ensures 1 manger per node, work on all 64 cores + bind_cmd: --cpu-bind + overrides: --depth=64 --ppn 1 + + account: SYNAPS-I + queue: debug + 
cpus_per_node: 64 + + # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" + scheduler_options: "#PBS -l filesystems=home:eagle" + + # Node setup: activate necessary conda environment and such + worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/reconstruction/env/tomopy; export PATH=$PATH:/eagle/SYNAPSE-I/; cd $HOME/.globus_compute/globus_compute_reconstruction" + + walltime: 00:60:00 # Jobs will end after 60 minutes + nodes_per_block: 2 # All jobs will have 1 node + init_blocks: 0 + min_blocks: 0 + max_blocks: 2 # No more than 1 job will be scheduled at a time From f4f9efa31338f482f6ef8e6b25410ead0ecc97a6 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:22:54 -0800 Subject: [PATCH 04/35] Adding the config.yaml file for setting up the globus compute endpoint for segmentation on ALCF. Still needs to be configured for GPU and the environment with dependencies --- .../globus_compute_segment_config.yaml | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 scripts/polaris/globus_compute_segment_config.yaml diff --git a/scripts/polaris/globus_compute_segment_config.yaml b/scripts/polaris/globus_compute_segment_config.yaml new file mode 100644 index 00000000..07bced00 --- /dev/null +++ b/scripts/polaris/globus_compute_segment_config.yaml @@ -0,0 +1,41 @@ +# This needs to be updated to use GPUs and a segmentation environment + +engine: + type: GlobusComputeEngine # This engine uses the HighThroughputExecutor + max_retries_on_system_failure: 2 + max_workers: 1 # Sets one worker per node + prefetch_capacity: 0 # Increase if you have many more tasks than workers + + address: + type: address_by_interface + ifname: bond0 + + strategy: simple + job_status_kwargs: + max_idletime: 300 + strategy_period: 60 + + provider: + type: PBSProProvider + + launcher: + type: MpiExecLauncher + # Ensures 1 manger per node, work on all 64 cores + bind_cmd: --cpu-bind + overrides: --depth=64 --ppn 1 + + account: SYNAPS-I + queue: debug + cpus_per_node: 64 + + # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" + scheduler_options: "#PBS -l filesystems=home:eagle" + + # Node setup: activate necessary conda environment and such + worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/reconstruction/env/tomopy; export PATH=$PATH:/eagle/SYNAPSE-I/; cd $HOME/.globus_compute/globus_compute_reconstruction" + + walltime: 00:60:00 # Jobs will end after 60 minutes + nodes_per_block: 2 # All jobs will have 1 node + init_blocks: 0 + min_blocks: 0 + max_blocks: 2 # No more than 1 job will be scheduled at a time From 0f5d5c9c0e9fcb71d5d5406ee6a312ae17615ea2 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:23:53 -0800 Subject: [PATCH 05/35] Adding segmentation Prefect task, and segmentation globus compute code for the TomographyController. 
Turning off TIFF to ZARR on ALCF for the demo --- orchestration/flows/bl832/alcf.py | 195 +++++++++++++++++++++++++----- 1 file changed, 166 insertions(+), 29 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index bdf96ac2..c0126985 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -35,7 +35,7 @@ def __init__( # The block must be registered with the name "alcf-allocation-root-path" logger = get_run_logger() allocation_data = Variable.get("alcf-allocation-root-path", _sync=True) - self.allocation_root = allocation_data.get("alcf-allocation-root-path") + self.allocation_root = allocation_data.get("alcf-allocation-root-path") # eagle/SYNAPS-I/ if not self.allocation_root: raise ValueError("Allocation root not found in JSON block 'alcf-allocation-root-path'") logger.info(f"Allocation root loaded: {self.allocation_root}") @@ -57,17 +57,19 @@ def reconstruct( file_name = Path(file_path).stem + ".h5" folder_name = Path(file_path).parent.name - iri_als_bl832_rundir = f"{self.allocation_root}/data/raw" - iri_als_bl832_recon_script = f"{self.allocation_root}/scripts/globus_reconstruction.py" + rundir = f"{self.allocation_root}/data/bl832/raw" + recon_script = f"{self.allocation_root}/reconstruction/scripts/globus_reconstruction.py" gcc = Client(code_serialization_strategy=CombinedCode()) + # TODO: Update globus-compute-endpoint Secret block with the new endpoint UUID + # We will probably have 2 endpoints, one for recon, one for segmentation with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: logger.info(f"Running Tomopy reconstruction on {file_name} at ALCF") future = fxe.submit( self._reconstruct_wrapper, - iri_als_bl832_rundir, - iri_als_bl832_recon_script, + rundir, + recon_script, file_name, folder_name ) @@ -76,8 +78,8 @@ def reconstruct( @staticmethod def _reconstruct_wrapper( - rundir: str = "/eagle/IRIProd/ALS/data/raw", - script_path: str = "/eagle/IRIProd/ALS/scripts/globus_reconstruction.py", + rundir: str = "/eagle/SYNAPS-I/data/bl832/raw", + script_path: str = "/eagle/SYNAPS-I/reconstruction/scripts/globus_reconstruction.py", h5_file_name: str = None, folder_path: str = None ) -> str: @@ -185,6 +187,101 @@ def _build_multi_resolution_wrapper( f"Converted tiff files to zarr;\n {zarr_res}" ) + def segmentation( + self, + folder_path: str = "", + ) -> bool: + """ + Run tomography segmentation at ALCF through Globus Compute. + + :param folder_path: Path to the TIFF folder to be processed. + + :return: True if the task completed successfully, False otherwise. 
+ """ + logger = get_run_logger() + + # Operate on reconstructed data + rundir = f"{self.allocation_root}/data/bl832/scratch/reconstruction/{Path(folder_path).name}" + output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{Path(folder_path).name}" + segmentation_script = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo/src/inference.py" + + gcc = Client(code_serialization_strategy=CombinedCode()) + + # TODO: Update globus-compute-endpoint Secret block with the new endpoint UUID + # We will probably have 2 endpoints, one for recon, one for segmentation + with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: + logger.info(f"Running segmentation on {folder_path} at ALCF") + future = fxe.submit( + self._segmentation_wrapper, + input_dir=rundir, + output_dir=output_dir, + script_path=segmentation_script, + output_dir=folder_path, + ) + result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) + return result + + @staticmethod + def _segmentation_wrapper( + input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", + output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", + script_path: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo/src/inference.py", + nproc_per_node: int = 4, + nnodes: int = 1, + nnode_rank: int = 0, + master_addr: str = "localhost", + master_port: str = "29500", + patch_size: int = 512, + batch_size: int = 1, + num_workers: int = 4, + confidence: float = 0.5, + prompts: list[str] = ["background", "cell"], + ) -> str: + """ + Python function that wraps around the application call for segmentation on ALCF + + :param rundir: the directory on the eagle file system (ALCF) where the input data are located + :param script_path: the path to the script that will run the segmentation + :param folder_path: the path to the folder containing the TIFF data to be segmented + :return: confirmation message + """ + import os + import subprocess + import time + + seg_start = time.time() + + # Move to directory where data are located + os.chdir(input_dir) + + # Run segmentation.py + command = [ + "torchrun", + f"--nproc_per_node={nproc_per_node}", + f"--nnodes={nnodes}", + f"--node_rank={nnode_rank}", + f"--master_addr={master_addr}", + f"--master_port={master_port}", + "-m", script_path, + "--input-dir", input_dir, + "--output-dir", output_dir, + "--patch-size", str(patch_size), + "--batch-size", str(batch_size), + "--num-workers", str(num_workers), + "--confidence", str(confidence), + "--prompts", *prompts, + ] + + segment_res = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + seg_end = time.time() + + print(f"Segmented data in {input_dir} in {seg_end-seg_start} seconds;\n {segment_res}") + return ( + f"Segmented data specified in {input_dir} in {seg_end-seg_start} seconds;\n" + f"{segment_res}" + ) + @staticmethod def _wait_for_globus_compute_future( future: Future, @@ -368,7 +465,7 @@ def alcf_recon_flow( config: Optional[Config832] = None, ) -> bool: """ - Process and transfer a file from a source to the ALCF. + Process and transfer a file from bl832 to ALCF and run reconstruction and segmentation. Args: file_path (str): The path to the file to be processed. 
@@ -437,51 +534,91 @@ def alcf_recon_flow( destination=config.data832_scratch ) - # STEP 2B: Run the Tiff to Zarr Globus Flow - logger.info(f"Starting ALCF tiff to zarr flow for {file_path=}") - alcf_multi_res_success = tomography_controller.build_multi_resolution( - file_path=file_path, + # STEP 3: Run the Segmentation Task at ALCF + logger.info(f"Starting ALCF segmentation task for {scratch_path_tiff=}") + alcf_segmentation_success = alcf_segmentation_task( + recon_folder_path=scratch_path_tiff, + config=config ) - if not alcf_multi_res_success: - logger.error("Tiff to Zarr Failed.") - raise ValueError("Tiff to Zarr at ALCF Failed") + if not alcf_segmentation_success: + logger.warning("Segmentation at ALCF Failed") else: - logger.info("Tiff to Zarr Successful.") - # Transfer B: Send reconstructed data (zarr) to data832 - logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " - f"at ALCF to {config.data832_scratch} at data832") - data832_zarr_transfer_success = transfer_controller.copy( - file_path=scratch_path_zarr, - source=config.alcf832_scratch, - destination=config.data832_scratch - ) + logger.info("Segmentation at ALCF Successful") + + # Not running TIFF to Zarr conversion at ALCF for now + # STEP 2B: Run the Tiff to Zarr Globus Flow + # logger.info(f"Starting ALCF tiff to zarr flow for {file_path=}") + # alcf_multi_res_success = tomography_controller.build_multi_resolution( + # file_path=file_path, + # ) + # if not alcf_multi_res_success: + # logger.error("Tiff to Zarr Failed.") + # raise ValueError("Tiff to Zarr at ALCF Failed") + # else: + # logger.info("Tiff to Zarr Successful.") + # # Transfer B: Send reconstructed data (zarr) to data832 + # logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " + # f"at ALCF to {config.data832_scratch} at data832") + # data832_zarr_transfer_success = transfer_controller.copy( + # file_path=scratch_path_zarr, + # source=config.alcf832_scratch, + # destination=config.data832_scratch + # ) # Place holder in case we want to transfer to NERSC for long term storage nersc_transfer_success = False - data832_tiff_transfer_success, data832_zarr_transfer_success, nersc_transfer_success + # data832_tiff_transfer_success, data832_zarr_transfer_success, nersc_transfer_success schedule_pruning( alcf_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, alcf_scratch_path_tiff=f"{scratch_path_tiff}" if alcf_reconstruction_success else None, - alcf_scratch_path_zarr=f"{scratch_path_zarr}" if alcf_multi_res_success else None, + # alcf_scratch_path_zarr=f"{scratch_path_zarr}" if alcf_multi_res_success else None, # Commenting out zarr for now nersc_scratch_path_tiff=f"{scratch_path_tiff}" if nersc_transfer_success else None, nersc_scratch_path_zarr=f"{scratch_path_zarr}" if nersc_transfer_success else None, data832_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, data832_scratch_path_tiff=f"{scratch_path_tiff}" if data832_tiff_transfer_success else None, - data832_scratch_path_zarr=f"{scratch_path_zarr}" if data832_zarr_transfer_success else None, + # data832_scratch_path_zarr=f"{scratch_path_zarr}" if data832_zarr_transfer_success else None, # Commenting out zarr one_minute=False, # Set to False for production durations config=config ) # TODO: ingest to scicat - if alcf_reconstruction_success and alcf_multi_res_success: + if alcf_reconstruction_success and alcf_segmentation_success: # and alcf_multi_res_success: return True else: return False -if __name__ == "__main__": 
+@task(name="alcf_segmentation_task") +def alcf_segmentation_task( + recon_folder_path: str, + config: Optional[Config832] = None, +): + logger = get_run_logger() + if config is None: + logger.info("No config provided, using default Config832.") + config = Config832() + + # Initialize the Tomography Controller and run the segmentation + logger.info("Initializing ALCF Tomography HPC Controller.") + tomography_controller = get_controller( + hpc_type=HPC.ALCF, + config=config + ) + logger.info(f"Starting ALCF segmentation task for {recon_folder_path=}") + alcf_segmentation_success = tomography_controller.segmentation( + recon_folder_path=recon_folder_path, + ) + if not alcf_segmentation_success: + logger.error("Segmentation Failed.") + else: + logger.info("Segmentation Successful.") + return alcf_segmentation_success + + +@flow(name="alcf_segmentation_integration_test", flow_run_name="alcf_segmentation_integration_test") +def alcf_segmentation_integration_test(): folder_name = 'dabramov' file_name = '20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast' flow_success = alcf_recon_flow( From d3ad2197ea9cbcdcabf013f2d87cb3cf13628dc2 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:30:02 -0800 Subject: [PATCH 06/35] ensuring self.config for scicat and ghcr images --- orchestration/flows/bl832/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 7294b0a7..17a2afbe 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -27,5 +27,5 @@ def _beam_specific_config(self) -> None: self.alcf832_synaps = self.endpoints["alcf832_synaps"] self.alcf832_iri_raw = self.endpoints["alcf832_iri_raw"] self.alcf832_iri_scratch = self.endpoints["alcf832_iri_scratch"] - self.scicat = config["scicat"] - self.ghcr_images832 = config["ghcr_images832"] + self.scicat = self.config["scicat"] + self.ghcr_images832 = self.config["ghcr_images832"] \ No newline at end of file From a96c4a87c26446e0a67961b7b1d04aa46c4f6146 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:30:39 -0800 Subject: [PATCH 07/35] linting --- orchestration/flows/bl832/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index 17a2afbe..da753411 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -28,4 +28,4 @@ def _beam_specific_config(self) -> None: self.alcf832_iri_raw = self.endpoints["alcf832_iri_raw"] self.alcf832_iri_scratch = self.endpoints["alcf832_iri_scratch"] self.scicat = self.config["scicat"] - self.ghcr_images832 = self.config["ghcr_images832"] \ No newline at end of file + self.ghcr_images832 = self.config["ghcr_images832"] From 5873d9412c3df6098b7bee23efaf48e0a2b27905 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Fri, 23 Jan 2026 10:57:25 -0800 Subject: [PATCH 08/35] Making separate ALCF SYNAPS-I endpoint configs for raw, reconstructed, and segmented data --- config.yml | 19 ++++++++++++++++--- orchestration/flows/bl832/alcf.py | 18 ++++++++++++------ orchestration/flows/bl832/config.py | 4 +++- 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/config.yml b/config.yml index 0d24832a..b4d29a5d 100644 --- a/config.yml +++ b/config.yml @@ -46,6 +46,7 @@ globus: uri: beegfs.als.lbl.gov uuid: d33b5d6e-1603-414e-93cb-bcb732b7914a name: bl733-beegfs-data + # 8.3.2 ENDPOINTS spot832: @@ -72,11 
+73,23 @@ globus: uuid: 75b478b2-37af-46df-bfbd-71ed692c6506 name: data832_scratch - alcf832_synaps: - root_path: / + alcf832_synaps_raw: + root_path: /data/bl832/raw + uri: alcf.anl.gov + uuid: TBD + name: alcf832_synaps_raw + + alcf832_synaps_recon: + root_path: /data/bl832/scratch/reconstruction/ + uri: alcf.anl.gov + uuid: TBD + name: alcf832_synaps_recon + + alcf832_synaps_segment: + root_path: /data/bl832/scratch/segmentation/ uri: alcf.anl.gov uuid: TBD - name: alcf832_synaps + name: alcf832_synaps_segment alcf832_iri_raw: root_path: /data/raw diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index c0126985..766ae211 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -195,7 +195,6 @@ def segmentation( Run tomography segmentation at ALCF through Globus Compute. :param folder_path: Path to the TIFF folder to be processed. - :return: True if the task completed successfully, False otherwise. """ logger = get_run_logger() @@ -484,6 +483,7 @@ def alcf_recon_flow( file_name = path.stem h5_file_name = file_name + '.h5' scratch_path_tiff = folder_name + '/rec' + file_name + '/' + scratch_path_segment = folder_name + '/seg' + file_name + '/' scratch_path_zarr = folder_name + '/rec' + file_name + '.zarr/' # initialize transfer_controller with globus @@ -498,7 +498,7 @@ def alcf_recon_flow( alcf_transfer_success = transfer_controller.copy( file_path=data832_raw_path, source=config.data832_raw, - destination=config.alcf832_raw + destination=config.alcf832_synaps_raw ) logger.info(f"Transfer status: {alcf_transfer_success}") @@ -526,11 +526,11 @@ def alcf_recon_flow( logger.info("Reconstruction Successful.") # Transfer A: Send reconstructed data (tiff) to data832 - logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " + logger.info(f"Transferring {file_name} from {config.alcf832_synaps_recon} " f"at ALCF to {config.data832_scratch} at data832") data832_tiff_transfer_success = transfer_controller.copy( file_path=scratch_path_tiff, - source=config.alcf832_scratch, + source=config.alcf832_synaps_recon, destination=config.data832_scratch ) @@ -544,6 +544,12 @@ def alcf_recon_flow( logger.warning("Segmentation at ALCF Failed") else: logger.info("Segmentation at ALCF Successful") + segment_transfer_success = transfer_controller.copy( + file_path=scratch_path_segment, + source=config.alcf832_synaps_segment, + destination=config.data832_scratch + ) + logger.info(f"Transfer segmented data to data832 success: {segment_transfer_success}") # Not running TIFF to Zarr conversion at ALCF for now # STEP 2B: Run the Tiff to Zarr Globus Flow @@ -621,8 +627,8 @@ def alcf_segmentation_task( def alcf_segmentation_integration_test(): folder_name = 'dabramov' file_name = '20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast' - flow_success = alcf_recon_flow( - file_path=f"/{folder_name}/{file_name}.h5", + flow_success = alcf_segmentation_task( + recon_folder_path=f"/{folder_name}/{file_name}", config=Config832() ) print(flow_success) diff --git a/orchestration/flows/bl832/config.py b/orchestration/flows/bl832/config.py index da753411..d523952d 100644 --- a/orchestration/flows/bl832/config.py +++ b/orchestration/flows/bl832/config.py @@ -24,7 +24,9 @@ def _beam_specific_config(self) -> None: self.nersc832_alsdev_pscratch_raw = self.endpoints["nersc832_alsdev_pscratch_raw"] self.nersc832_alsdev_pscratch_scratch = self.endpoints["nersc832_alsdev_pscratch_scratch"] self.nersc832_alsdev_recon_scripts = 
self.endpoints["nersc832_alsdev_recon_scripts"] - self.alcf832_synaps = self.endpoints["alcf832_synaps"] + self.alcf832_synaps_raw = self.endpoints["alcf832_synaps_raw"] + self.alcf832_synaps_recon = self.endpoints["alcf832_synaps_recon"] + self.alcf832_synaps_segment = self.endpoints["alcf832_synaps_segment"] self.alcf832_iri_raw = self.endpoints["alcf832_iri_raw"] self.alcf832_iri_scratch = self.endpoints["alcf832_iri_scratch"] self.scicat = self.config["scicat"] From 49e6e7f881507f9a18f98cab217cb89e03bd65b9 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 26 Jan 2026 11:53:46 -0800 Subject: [PATCH 09/35] Refactoring ALCF reconstruction flow to use the prune_controller class --- orchestration/flows/bl832/alcf.py | 356 +++++++++++++++++++----------- 1 file changed, 222 insertions(+), 134 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 766ae211..0794be99 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -1,5 +1,5 @@ from concurrent.futures import Future -import datetime +# import datetime from pathlib import Path import time from typing import Optional @@ -12,8 +12,9 @@ from orchestration.flows.bl832.config import Config832 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController +# from orchestration.prefect import schedule_prefect_flow +from orchestration.prune_controller import get_prune_controller, PruneMethod from orchestration.transfer_controller import get_transfer_controller, CopyMethod -from orchestration.prefect import schedule_prefect_flow class ALCFTomographyHPCController(TomographyHPCController): @@ -189,33 +190,36 @@ def _build_multi_resolution_wrapper( def segmentation( self, - folder_path: str = "", + recon_folder_path: str = "", ) -> bool: """ Run tomography segmentation at ALCF through Globus Compute. - :param folder_path: Path to the TIFF folder to be processed. + :param recon_folder_path: Path to the reconstructed data folder to be processed. :return: True if the task completed successfully, False otherwise. 
""" logger = get_run_logger() # Operate on reconstructed data - rundir = f"{self.allocation_root}/data/bl832/scratch/reconstruction/{Path(folder_path).name}" - output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{Path(folder_path).name}" - segmentation_script = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo/src/inference.py" + rundir = f"{self.allocation_root}/data/bl832/scratch/reconstruction/{recon_folder_path}" + output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{recon_folder_path}" + segmentation_module = "src.inference" + workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo" gcc = Client(code_serialization_strategy=CombinedCode()) # TODO: Update globus-compute-endpoint Secret block with the new endpoint UUID # We will probably have 2 endpoints, one for recon, one for segmentation - with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: - logger.info(f"Running segmentation on {folder_path} at ALCF") + endpoint_id = "168c595b-9493-42db-9c6a-aad960913de2" + # with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: + with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: + logger.info(f"Running segmentation on {recon_folder_path} at ALCF") future = fxe.submit( self._segmentation_wrapper, input_dir=rundir, output_dir=output_dir, - script_path=segmentation_script, - output_dir=folder_path, + script_module=segmentation_module, + workdir=workdir ) result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) return result @@ -224,7 +228,8 @@ def segmentation( def _segmentation_wrapper( input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", - script_path: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo/src/inference.py", + script_module: str = "src.inference", + workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo", nproc_per_node: int = 4, nnodes: int = 1, nnode_rank: int = 0, @@ -250,18 +255,18 @@ def _segmentation_wrapper( seg_start = time.time() - # Move to directory where data are located - os.chdir(input_dir) + # Move to directory where the segmentation code is located + os.chdir(workdir) # Run segmentation.py command = [ - "torchrun", + "python", "-m", "torch.distributed.run", f"--nproc_per_node={nproc_per_node}", f"--nnodes={nnodes}", f"--node_rank={nnode_rank}", f"--master_addr={master_addr}", f"--master_port={master_port}", - "-m", script_path, + "-m", script_module, "--input-dir", input_dir, "--output-dir", output_dir, "--patch-size", str(patch_size), @@ -353,109 +358,109 @@ def _wait_for_globus_compute_future( return success -@task(name="schedule_prune_task") -def schedule_prune_task( - path: str, - location: str, - schedule_days: datetime.timedelta, - source_endpoint=None, - check_endpoint=None -) -> bool: - """ - Schedules a Prefect flow to prune files from a specified location. - - Args: - path (str): The file path to the folder containing the files. - location (str): The server location (e.g., 'alcf832_raw') where the files will be pruned. - schedule_days (int): The number of days after which the file should be deleted. - source_endpoint (str): The source endpoint for the files. - check_endpoint (str): The endpoint to check for the existence of the files. - - Returns: - bool: True if the task was scheduled successfully, False otherwise. 
- """ - logger = get_run_logger() - - try: - flow_name = f"delete {location}: {Path(path).name}" - schedule_prefect_flow( - deployment_name=f"prune_{location}/prune_{location}", - flow_run_name=flow_name, - parameters={ - "relative_path": path, - "source_endpoint": source_endpoint, - "check_endpoint": check_endpoint - }, - duration_from_now=schedule_days - ) - return True - except Exception as e: - logger.error(f"Failed to schedule prune task: {e}") - return False - - -@task(name="schedule_pruning") -def schedule_pruning( - alcf_raw_path: str = None, - alcf_scratch_path_tiff: str = None, - alcf_scratch_path_zarr: str = None, - nersc_scratch_path_tiff: str = None, - nersc_scratch_path_zarr: str = None, - data832_raw_path: str = None, - data832_scratch_path_tiff: str = None, - data832_scratch_path_zarr: str = None, - one_minute: bool = False, - config: Config832 = None -) -> bool: - """ - This function schedules the deletion of files from specified locations on ALCF, NERSC, and data832. - - Args: - alcf_raw_path (str, optional): The raw path of the h5 file on ALCF. - alcf_scratch_path_tiff (str, optional): The scratch path for TIFF files on ALCF. - alcf_scratch_path_zarr (str, optional): The scratch path for Zarr files on ALCF. - nersc_scratch_path_tiff (str, optional): The scratch path for TIFF files on NERSC. - nersc_scratch_path_zarr (str, optional): The scratch path for Zarr files on NERSC. - data832_scratch_path (str, optional): The scratch path on data832. - one_minute (bool, optional): Defaults to False. Whether to schedule the deletion after one minute. - config (Config832, optional): Configuration object for the flow. - - Returns: - bool: True if the tasks were scheduled successfully, False otherwise. - """ - logger = get_run_logger() - - pruning_config = Variable.get("pruning-config", _sync=True) - - if one_minute: - alcf_delay = datetime.timedelta(minutes=1) - nersc_delay = datetime.timedelta(minutes=1) - data832_delay = datetime.timedelta(minutes=1) - else: - alcf_delay = datetime.timedelta(days=pruning_config["delete_alcf832_files_after_days"]) - nersc_delay = datetime.timedelta(days=pruning_config["delete_nersc832_files_after_days"]) - data832_delay = datetime.timedelta(days=pruning_config["delete_data832_files_after_days"]) - - # (path, location, days, source_endpoint, check_endpoint) - delete_schedules = [ - (alcf_raw_path, "alcf832_raw", alcf_delay, config.alcf832_raw, config.data832_raw), - (alcf_scratch_path_tiff, "alcf832_scratch", alcf_delay, config.alcf832_scratch, config.data832_scratch), - (alcf_scratch_path_zarr, "alcf832_scratch", alcf_delay, config.alcf832_scratch, config.data832_scratch), - (nersc_scratch_path_tiff, "nersc832_alsdev_scratch", nersc_delay, config.nersc832_alsdev_scratch, None), - (nersc_scratch_path_zarr, "nersc832_alsdev_scratch", nersc_delay, config.nersc832_alsdev_scratch, None), - (data832_raw_path, "data832_raw", data832_delay, config.data832_raw, None), - (data832_scratch_path_tiff, "data832_scratch", data832_delay, config.data832_scratch, None), - (data832_scratch_path_zarr, "data832_scratch", data832_delay, config.data832_scratch, None) - ] - - for path, location, days, source_endpoint, check_endpoint in delete_schedules: - if path: - schedule_prune_task(path, location, days, source_endpoint, check_endpoint) - logger.info(f"Scheduled delete from {location} at {days} days") - else: - logger.info(f"Path not provided for {location}, skipping scheduling of deletion task.") - - return True +# @task(name="schedule_prune_task") +# def 
schedule_prune_task( +# path: str, +# location: str, +# schedule_days: datetime.timedelta, +# source_endpoint=None, +# check_endpoint=None +# ) -> bool: +# """ +# Schedules a Prefect flow to prune files from a specified location. + +# Args: +# path (str): The file path to the folder containing the files. +# location (str): The server location (e.g., 'alcf832_raw') where the files will be pruned. +# schedule_days (int): The number of days after which the file should be deleted. +# source_endpoint (str): The source endpoint for the files. +# check_endpoint (str): The endpoint to check for the existence of the files. + +# Returns: +# bool: True if the task was scheduled successfully, False otherwise. +# """ +# logger = get_run_logger() + +# try: +# flow_name = f"delete {location}: {Path(path).name}" +# schedule_prefect_flow( +# deployment_name=f"prune_{location}/prune_{location}", +# flow_run_name=flow_name, +# parameters={ +# "relative_path": path, +# "source_endpoint": source_endpoint, +# "check_endpoint": check_endpoint +# }, +# duration_from_now=schedule_days +# ) +# return True +# except Exception as e: +# logger.error(f"Failed to schedule prune task: {e}") +# return False + + +# @task(name="schedule_pruning") +# def schedule_pruning( +# alcf_raw_path: str = None, +# alcf_scratch_path_tiff: str = None, +# alcf_scratch_path_zarr: str = None, +# nersc_scratch_path_tiff: str = None, +# nersc_scratch_path_zarr: str = None, +# data832_raw_path: str = None, +# data832_scratch_path_tiff: str = None, +# data832_scratch_path_zarr: str = None, +# one_minute: bool = False, +# config: Config832 = None +# ) -> bool: +# """ +# This function schedules the deletion of files from specified locations on ALCF, NERSC, and data832. + +# Args: +# alcf_raw_path (str, optional): The raw path of the h5 file on ALCF. +# alcf_scratch_path_tiff (str, optional): The scratch path for TIFF files on ALCF. +# alcf_scratch_path_zarr (str, optional): The scratch path for Zarr files on ALCF. +# nersc_scratch_path_tiff (str, optional): The scratch path for TIFF files on NERSC. +# nersc_scratch_path_zarr (str, optional): The scratch path for Zarr files on NERSC. +# data832_scratch_path (str, optional): The scratch path on data832. +# one_minute (bool, optional): Defaults to False. Whether to schedule the deletion after one minute. +# config (Config832, optional): Configuration object for the flow. + +# Returns: +# bool: True if the tasks were scheduled successfully, False otherwise. 
+# """ +# logger = get_run_logger() + +# pruning_config = Variable.get("pruning-config", _sync=True) + +# if one_minute: +# alcf_delay = datetime.timedelta(minutes=1) +# nersc_delay = datetime.timedelta(minutes=1) +# data832_delay = datetime.timedelta(minutes=1) +# else: +# alcf_delay = datetime.timedelta(days=pruning_config["delete_alcf832_files_after_days"]) +# nersc_delay = datetime.timedelta(days=pruning_config["delete_nersc832_files_after_days"]) +# data832_delay = datetime.timedelta(days=pruning_config["delete_data832_files_after_days"]) + +# # (path, location, days, source_endpoint, check_endpoint) +# delete_schedules = [ +# (alcf_raw_path, "alcf832_raw", alcf_delay, config.alcf832_raw, config.data832_raw), +# (alcf_scratch_path_tiff, "alcf832_scratch", alcf_delay, config.alcf832_scratch, config.data832_scratch), +# (alcf_scratch_path_zarr, "alcf832_scratch", alcf_delay, config.alcf832_scratch, config.data832_scratch), +# (nersc_scratch_path_tiff, "nersc832_alsdev_scratch", nersc_delay, config.nersc832_alsdev_scratch, None), +# (nersc_scratch_path_zarr, "nersc832_alsdev_scratch", nersc_delay, config.nersc832_alsdev_scratch, None), +# (data832_raw_path, "data832_raw", data832_delay, config.data832_raw, None), +# (data832_scratch_path_tiff, "data832_scratch", data832_delay, config.data832_scratch, None), +# (data832_scratch_path_zarr, "data832_scratch", data832_delay, config.data832_scratch, None) +# ] + +# for path, location, days, source_endpoint, check_endpoint in delete_schedules: +# if path: +# schedule_prune_task(path, location, days, source_endpoint, check_endpoint) +# logger.info(f"Scheduled delete from {location} at {days} days") +# else: +# logger.info(f"Path not provided for {location}, skipping scheduling of deletion task.") + +# return True @flow(name="alcf_recon_flow", flow_run_name="alcf_recon-{file_path}") @@ -533,6 +538,7 @@ def alcf_recon_flow( source=config.alcf832_synaps_recon, destination=config.data832_scratch ) + logger.info(f"Transfer reconstructed TIFF data to data832 success: {data832_tiff_transfer_success}") # STEP 3: Run the Segmentation Task at ALCF logger.info(f"Starting ALCF segmentation task for {scratch_path_tiff=}") @@ -552,6 +558,8 @@ def alcf_recon_flow( logger.info(f"Transfer segmented data to data832 success: {segment_transfer_success}") # Not running TIFF to Zarr conversion at ALCF for now + alcf_multi_res_success = False + data832_zarr_transfer_success = False # STEP 2B: Run the Tiff to Zarr Globus Flow # logger.info(f"Starting ALCF tiff to zarr flow for {file_path=}") # alcf_multi_res_success = tomography_controller.build_multi_resolution( @@ -572,22 +580,99 @@ def alcf_recon_flow( # ) # Place holder in case we want to transfer to NERSC for long term storage - nersc_transfer_success = False + # nersc_transfer_success = False - # data832_tiff_transfer_success, data832_zarr_transfer_success, nersc_transfer_success - schedule_pruning( - alcf_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, - alcf_scratch_path_tiff=f"{scratch_path_tiff}" if alcf_reconstruction_success else None, - # alcf_scratch_path_zarr=f"{scratch_path_zarr}" if alcf_multi_res_success else None, # Commenting out zarr for now - nersc_scratch_path_tiff=f"{scratch_path_tiff}" if nersc_transfer_success else None, - nersc_scratch_path_zarr=f"{scratch_path_zarr}" if nersc_transfer_success else None, - data832_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, - data832_scratch_path_tiff=f"{scratch_path_tiff}" if 
data832_tiff_transfer_success else None, - # data832_scratch_path_zarr=f"{scratch_path_zarr}" if data832_zarr_transfer_success else None, # Commenting out zarr - one_minute=False, # Set to False for production durations + # STEP 4: Schedule Pruning of files + logger.info("Scheduling file pruning tasks.") + prune_controller = get_prune_controller( + prune_type=PruneMethod.GLOBUS, config=config ) + # Prune from ALCF raw + if alcf_transfer_success: + logger.info("Scheduling pruning of ALCF raw data.") + prune_controller.prune( + file_path=data832_raw_path, + source_endpoint=config.alcf832_synaps_raw, + check_endpoint=None, + days_from_now=2.0 + ) + + # Prune TIFFs from ALCF scratch/reconstruction + if alcf_reconstruction_success: + logger.info("Scheduling pruning of ALCF scratch reconstruction data.") + prune_controller.prune( + file_path=scratch_path_tiff, + source_endpoint=config.alcf832_synaps_recon, + check_endpoint=config.data832_scratch, + days_from_now=2.0 + ) + + # Prune TIFFs from ALCF scratch/segmentation + if alcf_segmentation_success: + logger.info("Scheduling pruning of ALCF scratch segmentation data.") + prune_controller.prune( + file_path=scratch_path_segment, + source_endpoint=config.alcf832_synaps_segment, + check_endpoint=config.data832_scratch, + days_from_now=2.0 + ) + + # Prune ZARR from ALCF scratch/reconstruction + if alcf_multi_res_success: + logger.info("Scheduling pruning of ALCF scratch zarr reconstruction data.") + prune_controller.prune( + file_path=scratch_path_zarr, + source_endpoint=config.alcf832_synaps_recon, + check_endpoint=config.data832_scratch, + days_from_now=2.0 + ) + + # Prune reconstructed TIFFs from data832 scratch + if data832_tiff_transfer_success: + logger.info("Scheduling pruning of data832 scratch reconstruction TIFF data.") + prune_controller.prune( + file_path=scratch_path_tiff, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0 + ) + + # Prune reconstructed ZARR from data832 scratch + if data832_zarr_transfer_success: + logger.info("Scheduling pruning of data832 scratch reconstruction ZARR data.") + prune_controller.prune( + file_path=scratch_path_zarr, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0 + ) + + # Prune segmented data from data832 scratch + if alcf_segmentation_success: + logger.info("Scheduling pruning of data832 scratch segmentation data.") + prune_controller.prune( + file_path=scratch_path_segment, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0 + ) + + # data832_tiff_transfer_success, data832_zarr_transfer_success, nersc_transfer_success + # schedule_pruning( + # alcf_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, + # alcf_scratch_path_tiff=f"{scratch_path_tiff}" if alcf_reconstruction_success else None, + # # alcf_scratch_path_zarr=f"{scratch_path_zarr}" if alcf_multi_res_success else None, # Commenting out zarr for now + # nersc_scratch_path_tiff=f"{scratch_path_tiff}" if nersc_transfer_success else None, + # nersc_scratch_path_zarr=f"{scratch_path_zarr}" if nersc_transfer_success else None, + # data832_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, + # data832_scratch_path_tiff=f"{scratch_path_tiff}" if data832_tiff_transfer_success else None, + # # data832_scratch_path_zarr=f"{scratch_path_zarr}" if data832_zarr_transfer_success else None, # Commenting out zarr + # one_minute=False, # Set to False for production durations + # config=config + # ) + # 
TODO: ingest to scicat if alcf_reconstruction_success and alcf_segmentation_success: # and alcf_multi_res_success: @@ -625,10 +710,13 @@ def alcf_segmentation_task( @flow(name="alcf_segmentation_integration_test", flow_run_name="alcf_segmentation_integration_test") def alcf_segmentation_integration_test(): - folder_name = 'dabramov' - file_name = '20230606_151124_jong-seto_fungal-mycelia_roll-AQ_fungi1_fast' + recon_folder_path = 'rec20211222_125057_petiole4' flow_success = alcf_segmentation_task( - recon_folder_path=f"/{folder_name}/{file_name}", + recon_folder_path=recon_folder_path, config=Config832() ) print(flow_success) + + +if __name__ == "__main__": + alcf_segmentation_integration_test() From 922b7151f2c0c624d25f9adb7a720adadc2f911b Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 26 Jan 2026 11:54:47 -0800 Subject: [PATCH 10/35] Removing old commented out prune code --- orchestration/flows/bl832/alcf.py | 121 ------------------------------ 1 file changed, 121 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 0794be99..086a3668 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -1,5 +1,4 @@ from concurrent.futures import Future -# import datetime from pathlib import Path import time from typing import Optional @@ -12,7 +11,6 @@ from orchestration.flows.bl832.config import Config832 from orchestration.flows.bl832.job_controller import get_controller, HPC, TomographyHPCController -# from orchestration.prefect import schedule_prefect_flow from orchestration.prune_controller import get_prune_controller, PruneMethod from orchestration.transfer_controller import get_transfer_controller, CopyMethod @@ -358,111 +356,6 @@ def _wait_for_globus_compute_future( return success -# @task(name="schedule_prune_task") -# def schedule_prune_task( -# path: str, -# location: str, -# schedule_days: datetime.timedelta, -# source_endpoint=None, -# check_endpoint=None -# ) -> bool: -# """ -# Schedules a Prefect flow to prune files from a specified location. - -# Args: -# path (str): The file path to the folder containing the files. -# location (str): The server location (e.g., 'alcf832_raw') where the files will be pruned. -# schedule_days (int): The number of days after which the file should be deleted. -# source_endpoint (str): The source endpoint for the files. -# check_endpoint (str): The endpoint to check for the existence of the files. - -# Returns: -# bool: True if the task was scheduled successfully, False otherwise. 
-# """ -# logger = get_run_logger() - -# try: -# flow_name = f"delete {location}: {Path(path).name}" -# schedule_prefect_flow( -# deployment_name=f"prune_{location}/prune_{location}", -# flow_run_name=flow_name, -# parameters={ -# "relative_path": path, -# "source_endpoint": source_endpoint, -# "check_endpoint": check_endpoint -# }, -# duration_from_now=schedule_days -# ) -# return True -# except Exception as e: -# logger.error(f"Failed to schedule prune task: {e}") -# return False - - -# @task(name="schedule_pruning") -# def schedule_pruning( -# alcf_raw_path: str = None, -# alcf_scratch_path_tiff: str = None, -# alcf_scratch_path_zarr: str = None, -# nersc_scratch_path_tiff: str = None, -# nersc_scratch_path_zarr: str = None, -# data832_raw_path: str = None, -# data832_scratch_path_tiff: str = None, -# data832_scratch_path_zarr: str = None, -# one_minute: bool = False, -# config: Config832 = None -# ) -> bool: -# """ -# This function schedules the deletion of files from specified locations on ALCF, NERSC, and data832. - -# Args: -# alcf_raw_path (str, optional): The raw path of the h5 file on ALCF. -# alcf_scratch_path_tiff (str, optional): The scratch path for TIFF files on ALCF. -# alcf_scratch_path_zarr (str, optional): The scratch path for Zarr files on ALCF. -# nersc_scratch_path_tiff (str, optional): The scratch path for TIFF files on NERSC. -# nersc_scratch_path_zarr (str, optional): The scratch path for Zarr files on NERSC. -# data832_scratch_path (str, optional): The scratch path on data832. -# one_minute (bool, optional): Defaults to False. Whether to schedule the deletion after one minute. -# config (Config832, optional): Configuration object for the flow. - -# Returns: -# bool: True if the tasks were scheduled successfully, False otherwise. 
-# """ -# logger = get_run_logger() - -# pruning_config = Variable.get("pruning-config", _sync=True) - -# if one_minute: -# alcf_delay = datetime.timedelta(minutes=1) -# nersc_delay = datetime.timedelta(minutes=1) -# data832_delay = datetime.timedelta(minutes=1) -# else: -# alcf_delay = datetime.timedelta(days=pruning_config["delete_alcf832_files_after_days"]) -# nersc_delay = datetime.timedelta(days=pruning_config["delete_nersc832_files_after_days"]) -# data832_delay = datetime.timedelta(days=pruning_config["delete_data832_files_after_days"]) - -# # (path, location, days, source_endpoint, check_endpoint) -# delete_schedules = [ -# (alcf_raw_path, "alcf832_raw", alcf_delay, config.alcf832_raw, config.data832_raw), -# (alcf_scratch_path_tiff, "alcf832_scratch", alcf_delay, config.alcf832_scratch, config.data832_scratch), -# (alcf_scratch_path_zarr, "alcf832_scratch", alcf_delay, config.alcf832_scratch, config.data832_scratch), -# (nersc_scratch_path_tiff, "nersc832_alsdev_scratch", nersc_delay, config.nersc832_alsdev_scratch, None), -# (nersc_scratch_path_zarr, "nersc832_alsdev_scratch", nersc_delay, config.nersc832_alsdev_scratch, None), -# (data832_raw_path, "data832_raw", data832_delay, config.data832_raw, None), -# (data832_scratch_path_tiff, "data832_scratch", data832_delay, config.data832_scratch, None), -# (data832_scratch_path_zarr, "data832_scratch", data832_delay, config.data832_scratch, None) -# ] - -# for path, location, days, source_endpoint, check_endpoint in delete_schedules: -# if path: -# schedule_prune_task(path, location, days, source_endpoint, check_endpoint) -# logger.info(f"Scheduled delete from {location} at {days} days") -# else: -# logger.info(f"Path not provided for {location}, skipping scheduling of deletion task.") - -# return True - - @flow(name="alcf_recon_flow", flow_run_name="alcf_recon-{file_path}") def alcf_recon_flow( file_path: str, @@ -659,20 +552,6 @@ def alcf_recon_flow( days_from_now=30.0 ) - # data832_tiff_transfer_success, data832_zarr_transfer_success, nersc_transfer_success - # schedule_pruning( - # alcf_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, - # alcf_scratch_path_tiff=f"{scratch_path_tiff}" if alcf_reconstruction_success else None, - # # alcf_scratch_path_zarr=f"{scratch_path_zarr}" if alcf_multi_res_success else None, # Commenting out zarr for now - # nersc_scratch_path_tiff=f"{scratch_path_tiff}" if nersc_transfer_success else None, - # nersc_scratch_path_zarr=f"{scratch_path_zarr}" if nersc_transfer_success else None, - # data832_raw_path=f"{folder_name}/{h5_file_name}" if alcf_transfer_success else None, - # data832_scratch_path_tiff=f"{scratch_path_tiff}" if data832_tiff_transfer_success else None, - # # data832_scratch_path_zarr=f"{scratch_path_zarr}" if data832_zarr_transfer_success else None, # Commenting out zarr - # one_minute=False, # Set to False for production durations - # config=config - # ) - # TODO: ingest to scicat if alcf_reconstruction_success and alcf_segmentation_success: # and alcf_multi_res_success: From c9034117f1c3dd7a46cb4d07a70969590c3f96f7 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 26 Jan 2026 11:58:37 -0800 Subject: [PATCH 11/35] linting and docstrings --- orchestration/flows/bl832/alcf.py | 69 ++++++++++++++++--------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 086a3668..57874b38 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py 
@@ -21,14 +21,18 @@ class ALCFTomographyHPCController(TomographyHPCController): There is a @staticmethod wrapper for each compute task submitted via Globus Compute. Also, there is a shared wait_for_globus_compute_future method that waits for the task to complete. - Args: - TomographyHPCController (ABC): Abstract class for tomography HPC controllers. + :param TomographyHPCController: Abstract class for tomography HPC controllers. """ def __init__( self, config: Config832 ) -> None: + """ + Initialize the ALCF Tomography HPC Controller. + + :param config: Configuration object for the controller. + """ super().__init__(config) # Load allocation root from the Prefect JSON block # The block must be registered with the name "alcf-allocation-root-path" @@ -46,11 +50,8 @@ def reconstruct( """ Run tomography reconstruction at ALCF through Globus Compute. - Args: - file_path (str): Path to the file to be processed. - - Returns: - bool: True if the task completed successfully, False otherwise. + :param file_path : Path to the file to be processed. + :return: True if the task completed successfully, False otherwise. """ logger = get_run_logger() file_name = Path(file_path).stem + ".h5" @@ -85,14 +86,11 @@ def _reconstruct_wrapper( """ Python function that wraps around the application call for Tomopy reconstruction on ALCF - Args: - rundir (str): the directory on the eagle file system (ALCF) where the input data are located - script_path (str): the path to the script that will run the reconstruction - h5_file_name (str): the name of the h5 file to be reconstructed - folder_path (str): the path to the folder containing the h5 file - - Returns: - str: confirmation message + :param rundir: the directory on the eagle file system (ALCF) where the input data are located + :param script_path: the path to the script that will run the reconstruction + :param h5_file_name: the name of the h5 file to be reconstructed + :param folder_path: the path to the folder containing the h5 file + :return: confirmation message """ import os import subprocess @@ -123,11 +121,8 @@ def build_multi_resolution( """ Tiff to Zarr code that is executed using Globus Compute - Args: - file_path (str): Path to the file to be processed. - - Returns: - bool: True if the task completed successfully, False otherwise. + :param file_path: Path to the file to be processed. + :return: True if the task completed successfully, False otherwise. """ logger = get_run_logger() @@ -294,14 +289,11 @@ def _wait_for_globus_compute_future( """ Wait for a Globus Compute task to complete, assuming that if future.done() is False, the task is running. - Args: - future: The future object returned from the Globus Compute Executor submit method. - task_name: A descriptive name for the task being executed (used for logging). - check_interval: The interval (in seconds) between status checks. - walltime: The maximum time (in seconds) to wait for the task to complete. - - Returns: - bool: True if the task completed successfully within walltime, False otherwise. + :param future: The future object returned from the Globus Compute Executor submit method. + :param task_name: A descriptive name for the task being executed (used for logging). + :param check_interval: The interval (in seconds) between status checks. + :param walltime: The maximum time (in seconds) to wait for the task to complete. + :return: True if the task completed successfully within walltime, False otherwise. 
""" logger = get_run_logger() @@ -364,12 +356,9 @@ def alcf_recon_flow( """ Process and transfer a file from bl832 to ALCF and run reconstruction and segmentation. - Args: - file_path (str): The path to the file to be processed. - config (Config832): Configuration object for the flow. - - Returns: - bool: True if the flow completed successfully, False otherwise. + :param file_path: The path to the file to be processed. + :param config: Configuration object for the flow. + :return: True if the flow completed successfully, False otherwise. """ logger = get_run_logger() @@ -565,6 +554,13 @@ def alcf_segmentation_task( recon_folder_path: str, config: Optional[Config832] = None, ): + """ + Run segmentation task at ALCF. + + :param recon_folder_path: Path to the reconstructed data folder to be processed. + :param config: Configuration object for the flow. + :return: True if the task completed successfully, False otherwise. + """ logger = get_run_logger() if config is None: logger.info("No config provided, using default Config832.") @@ -589,6 +585,11 @@ def alcf_segmentation_task( @flow(name="alcf_segmentation_integration_test", flow_run_name="alcf_segmentation_integration_test") def alcf_segmentation_integration_test(): + """ + Integration test for the ALCF segmentation task. + + :return: None + """ recon_folder_path = 'rec20211222_125057_petiole4' flow_success = alcf_segmentation_task( recon_folder_path=recon_folder_path, From 5a5cff45614b33fd5b1adde55e03f68089058895 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 26 Jan 2026 12:04:49 -0800 Subject: [PATCH 12/35] Docstrings, linting, and type hints --- orchestration/flows/bl832/alcf.py | 44 ++++++++++++++++++------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 57874b38..ecdb84a2 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -159,13 +159,11 @@ def _build_multi_resolution_wrapper( """ Python function that wraps around the application call for Tiff to Zarr on ALCF - Args: - rundir (str): the directory on the eagle file system (ALCF) where the input data are located - script_path (str): the path to the script that will convert the tiff files to zarr - recon_path (str): the path to the reconstructed data - raw_path (str): the path to the raw data - Returns: - str: confirmation message + :param rundir: the directory on the eagle file system (ALCF) where the input data are located + :param script_path: the path to the script that will convert the tiff files to zarr + :param recon_path: the path to the reconstructed data + :param raw_path: the path to the raw data + :return: confirmation message """ import os import subprocess @@ -374,13 +372,14 @@ def alcf_recon_flow( scratch_path_zarr = folder_name + '/rec' + file_name + '.zarr/' # initialize transfer_controller with globus + logger.info("Initializing Globus Transfer Controller.") transfer_controller = get_transfer_controller( transfer_type=CopyMethod.GLOBUS, config=config ) # STEP 1: Transfer data from data832 to ALCF - logger.info("Copying data to ALCF.") + logger.info("Copying raw data to ALCF.") data832_raw_path = f"{folder_name}/{h5_file_name}" alcf_transfer_success = transfer_controller.copy( file_path=data832_raw_path, @@ -395,14 +394,16 @@ def alcf_recon_flow( else: logger.info("Transfer to ALCF Successful.") - # STEP 2A: Run the Tomopy Reconstruction Globus Flow + # STEP 2: Run the Tomopy Reconstruction Globus Flow logger.info(f"Starting ALCF 
reconstruction flow for {file_path=}") # Initialize the Tomography Controller and run the reconstruction + logger.info("Initializing ALCF Tomography HPC Controller.") tomography_controller = get_controller( hpc_type=HPC.ALCF, config=config ) + logger.info(f"Starting ALCF reconstruction task for {file_path=}") alcf_reconstruction_success = tomography_controller.reconstruct( file_path=file_path, ) @@ -412,7 +413,7 @@ def alcf_recon_flow( else: logger.info("Reconstruction Successful.") - # Transfer A: Send reconstructed data (tiff) to data832 + # STEP 3: Send reconstructed data (tiff) to data832 logger.info(f"Transferring {file_name} from {config.alcf832_synaps_recon} " f"at ALCF to {config.data832_scratch} at data832") data832_tiff_transfer_success = transfer_controller.copy( @@ -422,7 +423,7 @@ def alcf_recon_flow( ) logger.info(f"Transfer reconstructed TIFF data to data832 success: {data832_tiff_transfer_success}") - # STEP 3: Run the Segmentation Task at ALCF + # STEP 4: Run the Segmentation Task at ALCF logger.info(f"Starting ALCF segmentation task for {scratch_path_tiff=}") alcf_segmentation_success = alcf_segmentation_task( recon_folder_path=scratch_path_tiff, @@ -432,6 +433,10 @@ def alcf_recon_flow( logger.warning("Segmentation at ALCF Failed") else: logger.info("Segmentation at ALCF Successful") + + # STEP 5: Send segmented data to data832 + logger.info(f"Transferring {file_name} from {config.alcf832_synaps_segment} " + f"at ALCF to {config.data832_scratch} at data832") segment_transfer_success = transfer_controller.copy( file_path=scratch_path_segment, source=config.alcf832_synaps_segment, @@ -442,7 +447,7 @@ def alcf_recon_flow( # Not running TIFF to Zarr conversion at ALCF for now alcf_multi_res_success = False data832_zarr_transfer_success = False - # STEP 2B: Run the Tiff to Zarr Globus Flow + # STEP 6: Run the Tiff to Zarr Globus Flow # logger.info(f"Starting ALCF tiff to zarr flow for {file_path=}") # alcf_multi_res_success = tomography_controller.build_multi_resolution( # file_path=file_path, @@ -452,7 +457,7 @@ def alcf_recon_flow( # raise ValueError("Tiff to Zarr at ALCF Failed") # else: # logger.info("Tiff to Zarr Successful.") - # # Transfer B: Send reconstructed data (zarr) to data832 + # # STEP 7: Send reconstructed data (zarr) to data832 # logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " # f"at ALCF to {config.data832_scratch} at data832") # data832_zarr_transfer_success = transfer_controller.copy( @@ -464,7 +469,7 @@ def alcf_recon_flow( # Place holder in case we want to transfer to NERSC for long term storage # nersc_transfer_success = False - # STEP 4: Schedule Pruning of files + # STEP 8: Schedule Pruning of files logger.info("Scheduling file pruning tasks.") prune_controller = get_prune_controller( prune_type=PruneMethod.GLOBUS, @@ -553,7 +558,7 @@ def alcf_recon_flow( def alcf_segmentation_task( recon_folder_path: str, config: Optional[Config832] = None, -): +) -> bool: """ Run segmentation task at ALCF. @@ -584,18 +589,21 @@ def alcf_segmentation_task( @flow(name="alcf_segmentation_integration_test", flow_run_name="alcf_segmentation_integration_test") -def alcf_segmentation_integration_test(): +def alcf_segmentation_integration_test() -> bool: """ Integration test for the ALCF segmentation task. - :return: None + :return: True if the segmentation task completed successfully, False otherwise. 
""" + logger = get_run_logger() + logger.info("Starting ALCF segmentation integration test.") recon_folder_path = 'rec20211222_125057_petiole4' flow_success = alcf_segmentation_task( recon_folder_path=recon_folder_path, config=Config832() ) - print(flow_success) + logger.info(f"Flow success: {flow_success}") + return flow_success if __name__ == "__main__": From b5e0ba9b7800ae7335455406b25850596ba8b9a0 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 26 Jan 2026 12:05:42 -0800 Subject: [PATCH 13/35] Updating globus compute config for segmentation --- .../globus_compute_segment_config.yaml | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/scripts/polaris/globus_compute_segment_config.yaml b/scripts/polaris/globus_compute_segment_config.yaml index 07bced00..15f150ea 100644 --- a/scripts/polaris/globus_compute_segment_config.yaml +++ b/scripts/polaris/globus_compute_segment_config.yaml @@ -1,9 +1,8 @@ -# This needs to be updated to use GPUs and a segmentation environment - engine: type: GlobusComputeEngine # This engine uses the HighThroughputExecutor max_retries_on_system_failure: 2 max_workers: 1 # Sets one worker per node + max_workers_per_node: 4 prefetch_capacity: 0 # Increase if you have many more tasks than workers address: @@ -25,16 +24,24 @@ engine: overrides: --depth=64 --ppn 1 account: SYNAPS-I - queue: debug - cpus_per_node: 64 + queue: debug # debug (1-2 nodes), debug-scaling (1-10 nodes), or some other queue, probably want demand (1-56 nodes) for real-time things, prod (496 nodes) + # minimum node 1, max 56 nodes. Max time 59 minutes + cpus_per_node: 32 # may want to change to 4 (only 4 GPUs per node) # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" scheduler_options: "#PBS -l filesystems=home:eagle" - # Node setup: activate necessary conda environment and such - worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/reconstruction/env/tomopy; export PATH=$PATH:/eagle/SYNAPSE-I/; cd $HOME/.globus_compute/globus_compute_reconstruction" - - walltime: 00:60:00 # Jobs will end after 60 minutes + # worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/segmentation/env/; export PATH=$PATH:/eagle/SYNAPS-I/; cd $HOME/.globus_compute/globus_compute_segmentation" + worker_init: | + module use /soft/modulefiles + module load conda + conda activate base + source /eagle/SYNAPS-I/segmentation/env/bin/activate + export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface + export HF_HOME=$HF_HUB_CACHE + cd /eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo + + walltime: 59:00 # Jobs will end after 59 minutes nodes_per_block: 2 # All jobs will have 1 node init_blocks: 0 min_blocks: 0 From 54dab5df4239935c71c9e24a0ba236e8f3fe09b4 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 28 Jan 2026 11:04:24 -0800 Subject: [PATCH 14/35] turning ALCF recon+segmentation into a separate flow from recon+zarr conversion --- orchestration/flows/bl832/alcf.py | 213 +++++++++++++++++++++++------- 1 file changed, 165 insertions(+), 48 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index ecdb84a2..9b8bdb28 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -368,7 +368,6 @@ def alcf_recon_flow( file_name = path.stem h5_file_name = file_name + '.h5' scratch_path_tiff = folder_name + '/rec' + file_name + '/' - scratch_path_segment = folder_name + '/seg' + file_name + '/' 
scratch_path_zarr = folder_name + '/rec' + file_name + '.zarr/' # initialize transfer_controller with globus @@ -388,6 +387,169 @@ def alcf_recon_flow( ) logger.info(f"Transfer status: {alcf_transfer_success}") + if not alcf_transfer_success: + logger.error("Transfer failed due to configuration or authorization issues.") + raise ValueError("Transfer to ALCF Failed") + else: + logger.info("Transfer to ALCF Successful.") + + # STEP 2: Run Tomopy Reconstruction on Globus Compute + logger.info(f"Starting ALCF reconstruction flow for {file_path=}") + + # Initialize the Tomography Controller and run the reconstruction + logger.info("Initializing ALCF Tomography HPC Controller.") + tomography_controller = get_controller( + hpc_type=HPC.ALCF, + config=config + ) + logger.info(f"Starting ALCF reconstruction task for {file_path=}") + alcf_reconstruction_success = tomography_controller.reconstruct( + file_path=file_path, + ) + if not alcf_reconstruction_success: + logger.error("Reconstruction Failed.") + raise ValueError("Reconstruction at ALCF Failed") + else: + logger.info("Reconstruction Successful.") + + # STEP 3: Send reconstructed data (tiff) to data832 + logger.info(f"Transferring {file_name} from {config.alcf832_synaps_recon} " + f"at ALCF to {config.data832_scratch} at data832") + data832_tiff_transfer_success = transfer_controller.copy( + file_path=scratch_path_tiff, + source=config.alcf832_synaps_recon, + destination=config.data832_scratch + ) + logger.info(f"Transfer reconstructed TIFF data to data832 success: {data832_tiff_transfer_success}") + + # STEP 4: Run the Tiff to Zarr Globus Flow + logger.info(f"Starting ALCF tiff to zarr flow for {file_path=}") + alcf_multi_res_success = tomography_controller.build_multi_resolution( + file_path=file_path, + ) + if not alcf_multi_res_success: + logger.error("Tiff to Zarr Failed.") + raise ValueError("Tiff to Zarr at ALCF Failed") + else: + logger.info("Tiff to Zarr Successful.") + # STEP 5: Send reconstructed data (zarr) to data832 + logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " + f"at ALCF to {config.data832_scratch} at data832") + data832_zarr_transfer_success = transfer_controller.copy( + file_path=scratch_path_zarr, + source=config.alcf832_scratch, + destination=config.data832_scratch + ) + + # Place holder in case we want to transfer to NERSC for long term storage + # nersc_transfer_success = False + + # STEP 6: Schedule Pruning of files + logger.info("Scheduling file pruning tasks.") + prune_controller = get_prune_controller( + prune_type=PruneMethod.GLOBUS, + config=config + ) + + # Prune from ALCF raw + if alcf_transfer_success: + logger.info("Scheduling pruning of ALCF raw data.") + prune_controller.prune( + file_path=data832_raw_path, + source_endpoint=config.alcf832_synaps_raw, + check_endpoint=None, + days_from_now=2.0 + ) + + # Prune TIFFs from ALCF scratch/reconstruction + if alcf_reconstruction_success: + logger.info("Scheduling pruning of ALCF scratch reconstruction data.") + prune_controller.prune( + file_path=scratch_path_tiff, + source_endpoint=config.alcf832_synaps_recon, + check_endpoint=config.data832_scratch, + days_from_now=2.0 + ) + + # Prune ZARR from ALCF scratch/reconstruction + if alcf_multi_res_success: + logger.info("Scheduling pruning of ALCF scratch zarr reconstruction data.") + prune_controller.prune( + file_path=scratch_path_zarr, + source_endpoint=config.alcf832_synaps_recon, + check_endpoint=config.data832_scratch, + days_from_now=2.0 + ) + + # Prune reconstructed TIFFs from data832 
scratch + if data832_tiff_transfer_success: + logger.info("Scheduling pruning of data832 scratch reconstruction TIFF data.") + prune_controller.prune( + file_path=scratch_path_tiff, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0 + ) + + # Prune reconstructed ZARR from data832 scratch + if data832_zarr_transfer_success: + logger.info("Scheduling pruning of data832 scratch reconstruction ZARR data.") + prune_controller.prune( + file_path=scratch_path_zarr, + source_endpoint=config.data832_scratch, + check_endpoint=None, + days_from_now=30.0 + ) + + # TODO: ingest to scicat + + if alcf_reconstruction_success and alcf_multi_res_success: + return True + else: + return False + + +@flow(name="forge_alcf_recon_segment_flow", flow_run_name="alcf_recon_seg-{file_path}") +def forge_alcf_recon_segment_flow( + file_path: str, + config: Optional[Config832] = None, +) -> bool: + """ + Process and transfer a file from bl832 to ALCF and run reconstruction and segmentation. + + :param file_path: The path to the file to be processed. + :param config: Configuration object for the flow. + :return: True if the flow completed successfully, False otherwise. + """ + logger = get_run_logger() + + if config is None: + config = Config832() + # set up file paths + path = Path(file_path) + folder_name = path.parent.name + file_name = path.stem + h5_file_name = file_name + '.h5' + scratch_path_tiff = folder_name + '/rec' + file_name + '/' + scratch_path_segment = folder_name + '/seg' + file_name + '/' + + # initialize transfer_controller with globus + logger.info("Initializing Globus Transfer Controller.") + transfer_controller = get_transfer_controller( + transfer_type=CopyMethod.GLOBUS, + config=config + ) + + # STEP 1: Transfer data from data832 to ALCF + logger.info("Copying raw data to ALCF.") + data832_raw_path = f"{folder_name}/{h5_file_name}" + alcf_transfer_success = transfer_controller.copy( + file_path=data832_raw_path, + source=config.data832_raw, + destination=config.alcf832_synaps_raw + ) + logger.info(f"Transfer status: {alcf_transfer_success}") + if not alcf_transfer_success: logger.error("Transfer failed due to configuration or authorization issues.") raise ValueError("Transfer to ALCF Failed") @@ -444,32 +606,7 @@ def alcf_recon_flow( ) logger.info(f"Transfer segmented data to data832 success: {segment_transfer_success}") - # Not running TIFF to Zarr conversion at ALCF for now - alcf_multi_res_success = False - data832_zarr_transfer_success = False - # STEP 6: Run the Tiff to Zarr Globus Flow - # logger.info(f"Starting ALCF tiff to zarr flow for {file_path=}") - # alcf_multi_res_success = tomography_controller.build_multi_resolution( - # file_path=file_path, - # ) - # if not alcf_multi_res_success: - # logger.error("Tiff to Zarr Failed.") - # raise ValueError("Tiff to Zarr at ALCF Failed") - # else: - # logger.info("Tiff to Zarr Successful.") - # # STEP 7: Send reconstructed data (zarr) to data832 - # logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " - # f"at ALCF to {config.data832_scratch} at data832") - # data832_zarr_transfer_success = transfer_controller.copy( - # file_path=scratch_path_zarr, - # source=config.alcf832_scratch, - # destination=config.data832_scratch - # ) - - # Place holder in case we want to transfer to NERSC for long term storage - # nersc_transfer_success = False - - # STEP 8: Schedule Pruning of files + # STEP 6: Schedule Pruning of files logger.info("Scheduling file pruning tasks.") prune_controller = 
get_prune_controller( prune_type=PruneMethod.GLOBUS, @@ -506,16 +643,6 @@ def alcf_recon_flow( days_from_now=2.0 ) - # Prune ZARR from ALCF scratch/reconstruction - if alcf_multi_res_success: - logger.info("Scheduling pruning of ALCF scratch zarr reconstruction data.") - prune_controller.prune( - file_path=scratch_path_zarr, - source_endpoint=config.alcf832_synaps_recon, - check_endpoint=config.data832_scratch, - days_from_now=2.0 - ) - # Prune reconstructed TIFFs from data832 scratch if data832_tiff_transfer_success: logger.info("Scheduling pruning of data832 scratch reconstruction TIFF data.") @@ -526,16 +653,6 @@ def alcf_recon_flow( days_from_now=30.0 ) - # Prune reconstructed ZARR from data832 scratch - if data832_zarr_transfer_success: - logger.info("Scheduling pruning of data832 scratch reconstruction ZARR data.") - prune_controller.prune( - file_path=scratch_path_zarr, - source_endpoint=config.data832_scratch, - check_endpoint=None, - days_from_now=30.0 - ) - # Prune segmented data from data832 scratch if alcf_segmentation_success: logger.info("Scheduling pruning of data832 scratch segmentation data.") @@ -548,7 +665,7 @@ def alcf_recon_flow( # TODO: ingest to scicat - if alcf_reconstruction_success and alcf_segmentation_success: # and alcf_multi_res_success: + if alcf_reconstruction_success and alcf_segmentation_success: return True else: return False From 6361a332a792f2e1718a8ae8c87fddd40f624b6a Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 28 Jan 2026 11:22:42 -0800 Subject: [PATCH 15/35] updating pytest for alcf reconstruction --- orchestration/_tests/test_globus_flow.py | 39 +++++++++++++----------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/orchestration/_tests/test_globus_flow.py b/orchestration/_tests/test_globus_flow.py index 4e424bad..6459815e 100644 --- a/orchestration/_tests/test_globus_flow.py +++ b/orchestration/_tests/test_globus_flow.py @@ -147,8 +147,8 @@ def __init__(self) -> None: MockSecret.for_endpoint("nersc832_alsdev_raw")), "nersc832_alsdev_scratch": MockEndpoint("mock_nersc832_alsdev_scratch_path", MockSecret.for_endpoint("nersc832_alsdev_scratch")), - "alcf832_raw": MockEndpoint("mock_alcf832_raw_path", MockSecret.for_endpoint("alcf832_raw")), - "alcf832_scratch": MockEndpoint("mock_alcf832_scratch_path", MockSecret.for_endpoint("alcf832_scratch")), + "alcf832_iri_raw": MockEndpoint("mock_alcf832_raw_path", MockSecret.for_endpoint("alcf832_iri_raw")), + "alcf832_iri_scratch": MockEndpoint("mock_alcf832_scratch_path", MockSecret.for_endpoint("alcf832_iri_scratch")), } # Mock apps @@ -163,8 +163,8 @@ def __init__(self) -> None: self.spot832 = self.endpoints["spot832"] self.data832 = self.endpoints["data832"] self.nersc832 = self.endpoints["nersc832"] - self.alcf832_raw = self.endpoints["alcf832_raw"] - self.alcf832_scratch = self.endpoints["alcf832_scratch"] + self.alcf832_iri_raw = self.endpoints["alcf832_iri_raw"] + self.alcf832_iri_scratch = self.endpoints["alcf832_iri_scratch"] self.data832_raw = self.endpoints["data832_raw"] self.data832_scratch = self.endpoints["data832_scratch"] self.nersc832_alsdev_scratch = self.endpoints["nersc832_alsdev_scratch"] @@ -247,8 +247,11 @@ def test_alcf_recon_flow(mocker: MockFixture): "nersc832_alsdev_pscratch_raw": mocker.MagicMock(), "nersc832_alsdev_pscratch_scratch": mocker.MagicMock(), "nersc832_alsdev_recon_scripts": mocker.MagicMock(), - "alcf832_raw": mocker.MagicMock(), - "alcf832_scratch": mocker.MagicMock(), + "alcf832_iri_raw": mocker.MagicMock(), + "alcf832_iri_scratch": 
mocker.MagicMock(), + "alcf832_synaps_raw": mocker.MagicMock(), + "alcf832_synaps_recon": mocker.MagicMock(), + "alcf832_synaps_segment": mocker.MagicMock(), } ) mocker.patch( @@ -298,10 +301,12 @@ def test_alcf_recon_flow(mocker: MockFixture): return_value=mock_transfer_controller ) - # 7) Patch schedule_pruning => skip real scheduling - mock_schedule_pruning = mocker.patch( - "orchestration.flows.bl832.alcf.schedule_pruning", - return_value=True + # 7) Patch get_prune_controller(...) => skip real scheduling + mock_prune_controller = mocker.MagicMock() + mock_prune_controller.prune.return_value = True + mocker.patch( + "orchestration.flows.bl832.alcf.get_prune_controller", + return_value=mock_prune_controller ) file_path = "/global/raw/transfer_tests/test.h5" @@ -316,13 +321,13 @@ def test_alcf_recon_flow(mocker: MockFixture): assert mock_transfer_controller.copy.call_count == 3, "Should do 3 transfers in success path" mock_hpc_reconstruct.assert_called_once() mock_hpc_multires.assert_called_once() - mock_schedule_pruning.assert_called_once() + assert mock_prune_controller.prune.call_count == 5, "Should schedule 5 prune operations in success path" # Reset for next scenario mock_transfer_controller.copy.reset_mock() mock_hpc_reconstruct.reset_mock() mock_hpc_multires.reset_mock() - mock_schedule_pruning.reset_mock() + mock_prune_controller.prune.reset_mock() # # ---------- CASE 2: HPC reconstruction fails ---------- @@ -339,13 +344,13 @@ def test_alcf_recon_flow(mocker: MockFixture): assert mock_transfer_controller.copy.call_count == 1, ( "Should only do the first data832->alcf copy before HPC fails" ) - mock_schedule_pruning.assert_not_called() + mock_prune_controller.prune.assert_not_called() # Reset mock_transfer_controller.copy.reset_mock() mock_hpc_reconstruct.reset_mock() mock_hpc_multires.reset_mock() - mock_schedule_pruning.reset_mock() + mock_prune_controller.prune.reset_mock() # ---------- CASE 3: Tiff->Zarr fails ---------- mock_transfer_controller.copy.return_value = True @@ -360,13 +365,13 @@ def test_alcf_recon_flow(mocker: MockFixture): # HPC is done, so there's 2 successful transfer (data832->alcf). 
# We have not transferred tiff or zarr => total 2 copies assert mock_transfer_controller.copy.call_count == 2 - mock_schedule_pruning.assert_not_called() + mock_prune_controller.prune.assert_not_called() # Reset mock_transfer_controller.copy.reset_mock() mock_hpc_reconstruct.reset_mock() mock_hpc_multires.reset_mock() - mock_schedule_pruning.reset_mock() + mock_prune_controller.prune.reset_mock() # ---------- CASE 4: data832->ALCF fails immediately ---------- mock_transfer_controller.copy.return_value = False @@ -380,4 +385,4 @@ def test_alcf_recon_flow(mocker: MockFixture): mock_hpc_multires.assert_not_called() # The only call is the failing copy mock_transfer_controller.copy.assert_called_once() - mock_schedule_pruning.assert_not_called() + mock_prune_controller.prune.assert_not_called() From f96c5cd7243c0885a1624b9783ec79e3a6e480b7 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 28 Jan 2026 11:22:56 -0800 Subject: [PATCH 16/35] Adjusting endpoint names for synaps --- orchestration/flows/bl832/alcf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 9b8bdb28..84b22692 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -433,11 +433,11 @@ def alcf_recon_flow( else: logger.info("Tiff to Zarr Successful.") # STEP 5: Send reconstructed data (zarr) to data832 - logger.info(f"Transferring {file_name} from {config.alcf832_scratch} " + logger.info(f"Transferring {file_name} from {config.alcf832_synaps_recon} " f"at ALCF to {config.data832_scratch} at data832") data832_zarr_transfer_success = transfer_controller.copy( file_path=scratch_path_zarr, - source=config.alcf832_scratch, + source=config.alcf832_synaps_recon, destination=config.data832_scratch ) From 8324b7d8b4a6554b42be46e36c9ad31d0a2da7c3 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 28 Jan 2026 11:44:35 -0800 Subject: [PATCH 17/35] adding the alcf_forge_recon_segment flow to prefect.yaml as a separate deployment --- orchestration/flows/bl832/alcf.py | 4 ++-- orchestration/flows/bl832/prefect.yaml | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 84b22692..da864c83 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -509,8 +509,8 @@ def alcf_recon_flow( return False -@flow(name="forge_alcf_recon_segment_flow", flow_run_name="alcf_recon_seg-{file_path}") -def forge_alcf_recon_segment_flow( +@flow(name="alcf_forge_recon_segment_flow", flow_run_name="alcf_recon_seg-{file_path}") +def alcf_forge_recon_segment_flow( file_path: str, config: Optional[Config832] = None, ) -> bool: diff --git a/orchestration/flows/bl832/prefect.yaml b/orchestration/flows/bl832/prefect.yaml index a1d4613b..20858610 100644 --- a/orchestration/flows/bl832/prefect.yaml +++ b/orchestration/flows/bl832/prefect.yaml @@ -55,6 +55,12 @@ deployments: name: alcf_recon_flow_pool work_queue_name: alcf_recon_flow_queue +- alcf_forge_recon_segment_flow: + entrypoint: orchestration/flows/bl832/alcf.py:alcf_forge_recon_segment_flow + work_pool: + name: alcf_recon_flow_pool + work_queue_name: alcf_forge_recon_segment_flow_queue + # Pruning flows - name: prune_globus_endpoint entrypoint: orchestration/prune_controller.py:prune_globus_endpoint From 7599f2e6a55d3b2807d0a60a306383b7096a70f1 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 28 Jan 2026 11:45:02 -0800 Subject: [PATCH 18/35] 
updating bl832 dispatcher to include alcf_forge_recon_segment as a separate option --- orchestration/flows/bl832/dispatcher.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/orchestration/flows/bl832/dispatcher.py b/orchestration/flows/bl832/dispatcher.py index cf1d0c64..7c799c2b 100644 --- a/orchestration/flows/bl832/dispatcher.py +++ b/orchestration/flows/bl832/dispatcher.py @@ -17,6 +17,9 @@ class FlowParameterMapper: "alcf_recon_flow/alcf_recon_flow": [ "file_path", "config"], + "alcf_forge_recon_segment_flow/alcf_forge_recon_segment_flow": [ + "file_path", + "config"], # From move.py "new_832_file_flow/new_file_832": [ "file_path", @@ -55,22 +58,26 @@ class DecisionFlowInputModel(BaseModel): @task(name="setup_decision_settings") -def setup_decision_settings(alcf_recon: bool, nersc_recon: bool, new_file_832: bool) -> dict: +def setup_decision_settings(alcf_recon: bool, alcf_forge_recon_segment: bool, nersc_recon: bool, new_file_832: bool) -> dict: """ This task is used to define the settings for the decision making process of the BL832 beamline. :param alcf_recon: Boolean indicating whether to run the ALCF reconstruction flow. + :param alcf_forge_recon_segment: Boolean indicating whether to run the ALCF forge reconstruction segmentation flow. :param nersc_recon: Boolean indicating whether to run the NERSC reconstruction flow. - :param nersc_move: Boolean indicating whether to move files to NERSC. + :param new_file_832: Boolean indicating whether to run the new file 832 flow. :return: A dictionary containing the settings for each flow. """ logger = get_run_logger() try: logger.info(f"Setting up decision settings: alcf_recon={alcf_recon}, " - f"nersc_recon={nersc_recon}, new_file_832={new_file_832}") + f"alcf_forge_recon_segment={alcf_forge_recon_segment}, " + f"nersc_recon={nersc_recon}, " + f"new_file_832={new_file_832}") # Define which flows to run based on the input settings settings = { "alcf_recon_flow/alcf_recon_flow": alcf_recon, + "alcf_forge_recon_segment_flow/alcf_forge_recon_segment_flow": alcf_forge_recon_segment, "nersc_recon_flow/nersc_recon_flow": nersc_recon, "new_832_file_flow/new_file_832": new_file_832 } @@ -145,6 +152,13 @@ async def dispatcher( alcf_params = FlowParameterMapper.get_flow_parameters("alcf_recon_flow/alcf_recon_flow", available_params) tasks.append(run_recon_flow_async("alcf_recon_flow/alcf_recon_flow", alcf_params)) + if decision_settings.get("alcf_forge_recon_segment_flow/alcf_forge_recon_segment_flow"): + alcf_forge_params = FlowParameterMapper.get_flow_parameters( + "alcf_forge_recon_segment_flow/alcf_forge_recon_segment_flow", + available_params + ) + tasks.append(run_recon_flow_async("alcf_forge_recon_segment_flow/alcf_forge_recon_segment_flow", alcf_forge_params)) + if decision_settings.get("nersc_recon_flow/nersc_recon_flow"): nersc_params = FlowParameterMapper.get_flow_parameters("nersc_recon_flow/nersc_recon_flow", available_params) tasks.append(run_recon_flow_async("nersc_recon_flow/nersc_recon_flow", nersc_params)) @@ -169,7 +183,7 @@ async def dispatcher( """ try: # Setup decision settings based on input parameters - setup_decision_settings(alcf_recon=True, nersc_recon=True, new_file_832=True) + setup_decision_settings(alcf_recon=True, alcf_forge_recon_segment=False, nersc_recon=True, new_file_832=True) # Run the main decision flow with the specified parameters # asyncio.run(dispatcher( # config={}, # PYTEST, ALCF, NERSC From d78c98db377c4535878ca68a910f664d555c4312 Mon Sep 17 00:00:00 2001 From: 
David Abramov Date: Mon, 2 Feb 2026 10:56:57 -0800 Subject: [PATCH 19/35] adding transfer client uuid for ALCF SYNAPS-I --- config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.yml b/config.yml index b4d29a5d..f224ce14 100644 --- a/config.yml +++ b/config.yml @@ -76,19 +76,19 @@ globus: alcf832_synaps_raw: root_path: /data/bl832/raw uri: alcf.anl.gov - uuid: TBD + uuid: 728a8e30-32ef-4000-814c-f9ccbc00bf13 name: alcf832_synaps_raw alcf832_synaps_recon: root_path: /data/bl832/scratch/reconstruction/ uri: alcf.anl.gov - uuid: TBD + uuid: 728a8e30-32ef-4000-814c-f9ccbc00bf13 name: alcf832_synaps_recon alcf832_synaps_segment: root_path: /data/bl832/scratch/segmentation/ uri: alcf.anl.gov - uuid: TBD + uuid: 728a8e30-32ef-4000-814c-f9ccbc00bf13 name: alcf832_synaps_segment alcf832_iri_raw: From 3c0f25eaaaaf9ea54cb0b956e26c40cdbce5f8e8 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 2 Feb 2026 10:57:20 -0800 Subject: [PATCH 20/35] this configuration worked for launching segmentation on 1 GPU --- orchestration/flows/bl832/alcf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index da864c83..ac122f30 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -221,7 +221,7 @@ def _segmentation_wrapper( output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", script_module: str = "src.inference", workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo", - nproc_per_node: int = 4, + nproc_per_node: int = 4, # 1 works nnodes: int = 1, nnode_rank: int = 0, master_addr: str = "localhost", @@ -267,7 +267,10 @@ def _segmentation_wrapper( "--prompts", *prompts, ] - segment_res = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + segment_res = subprocess.run(command) # stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if segment_res.returncode != 0: + raise RuntimeError(f"Segmentation failed with return code {segment_res.returncode}") seg_end = time.time() @@ -282,7 +285,7 @@ def _wait_for_globus_compute_future( future: Future, task_name: str, check_interval: int = 20, - walltime: int = 1200 # seconds = 20 minutes + walltime: int = 3600 # seconds = 60 minutes ) -> bool: """ Wait for a Globus Compute task to complete, assuming that if future.done() is False, the task is running. 
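# Aside (illustrative sketch, not part of the patch): the check_interval/walltime polling
# pattern that a helper like _wait_for_globus_compute_future implements. The helper body is
# not shown in this hunk, so the logic below is an assumption of the general shape, not a
# copy of the real implementation in alcf.py.
import time
from concurrent.futures import Future


def wait_for_future_sketch(future: Future, task_name: str,
                           check_interval: int = 20, walltime: int = 3600) -> bool:
    """Poll a Future until it completes or the walltime budget (in seconds) runs out."""
    start = time.time()
    while not future.done():
        if time.time() - start > walltime:
            print(f"{task_name} exceeded the {walltime} s walltime; treating it as failed.")
            return False
        time.sleep(check_interval)  # avoid busy-waiting between status checks
    try:
        result = future.result()    # re-raises any exception from the remote task
        print(f"{task_name} finished: {result}")
        return True
    except Exception as exc:
        print(f"{task_name} failed: {exc}")
        return False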
@@ -714,7 +717,7 @@ def alcf_segmentation_integration_test() -> bool: """ logger = get_run_logger() logger.info("Starting ALCF segmentation integration test.") - recon_folder_path = 'rec20211222_125057_petiole4' + recon_folder_path = 'test' # 'rec20211222_125057_petiole4' flow_success = alcf_segmentation_task( recon_folder_path=recon_folder_path, config=Config832() From e6ebd1f609cc35047f590e1f99af50dee8f0bc38 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Mon, 2 Feb 2026 10:57:39 -0800 Subject: [PATCH 21/35] Updating segmentation compute endpoint config --- scripts/polaris/globus_compute_segment_config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/polaris/globus_compute_segment_config.yaml b/scripts/polaris/globus_compute_segment_config.yaml index 15f150ea..6479e84f 100644 --- a/scripts/polaris/globus_compute_segment_config.yaml +++ b/scripts/polaris/globus_compute_segment_config.yaml @@ -1,7 +1,7 @@ engine: type: GlobusComputeEngine # This engine uses the HighThroughputExecutor max_retries_on_system_failure: 2 - max_workers: 1 # Sets one worker per node + # max_workers: 1 # Sets one worker per node max_workers_per_node: 4 prefetch_capacity: 0 # Increase if you have many more tasks than workers @@ -29,7 +29,7 @@ engine: cpus_per_node: 32 # may want to change to 4 (only 4 GPUs per node) # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" - scheduler_options: "#PBS -l filesystems=home:eagle" + scheduler_options: "#PBS -l filesystems=home:eagle -l select=1:ngpus=4" # Node setup: activate necessary conda environment and such # worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/segmentation/env/; export PATH=$PATH:/eagle/SYNAPS-I/; cd $HOME/.globus_compute/globus_compute_segmentation" worker_init: | From 3293d68cc63828dcb588317756412658b0ed8596 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 3 Feb 2026 13:05:11 -0800 Subject: [PATCH 22/35] Adding a separate wrapper for segmentation v2. 
At some point we may want to collapse these into the final version, but for testing purposes I'm leaving both codes --- orchestration/flows/bl832/alcf.py | 132 ++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 14 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index ac122f30..3d328510 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -191,11 +191,11 @@ def segmentation( """ logger = get_run_logger() + SEGMENTATION_VERSION = "v2" # "v2" + # Operate on reconstructed data rundir = f"{self.allocation_root}/data/bl832/scratch/reconstruction/{recon_folder_path}" output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{recon_folder_path}" - segmentation_module = "src.inference" - workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo" gcc = Client(code_serialization_strategy=CombinedCode()) @@ -203,20 +203,40 @@ def segmentation( # We will probably have 2 endpoints, one for recon, one for segmentation endpoint_id = "168c595b-9493-42db-9c6a-aad960913de2" # with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: - with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: - logger.info(f"Running segmentation on {recon_folder_path} at ALCF") - future = fxe.submit( - self._segmentation_wrapper, - input_dir=rundir, - output_dir=output_dir, - script_module=segmentation_module, - workdir=workdir - ) - result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) - return result + + if SEGMENTATION_VERSION == "v1": + segmentation_module = "src.inference" + workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo" + + with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: + logger.info(f"Running segmentation on {recon_folder_path} at ALCF") + future = fxe.submit( + self._segmentation_wrapper_v1, + input_dir=rundir, + output_dir=output_dir, + script_module=segmentation_module, + workdir=workdir + ) + result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) + + elif SEGMENTATION_VERSION == "v2": + segmentation_module = "src.inference_v2" + workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" + with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: + logger.info(f"Running segmentation on {recon_folder_path} at ALCF") + future = fxe.submit( + self._segmentation_wrapper_v2, + input_dir=rundir, + output_dir=output_dir, + script_module=segmentation_module, + workdir=workdir + ) + result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) + + return result @staticmethod - def _segmentation_wrapper( + def _segmentation_wrapper_v1( input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", script_module: str = "src.inference", @@ -280,6 +300,90 @@ def _segmentation_wrapper( f"{segment_res}" ) + @staticmethod + def _segmentation_wrapper_v2( + input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", + output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", + script_module: str = "src.inference_v2", + workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", + nproc_per_node: int = 4, + nnodes: int = 1, + patch_size: int = 640, + batch_size: int = 1, + confidence: float = 0.5, + prompts: list[str] = ["Cortex", 
"Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], + bpe_path: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/bpe_simple_vocab_16e6.txt.gz", + finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint.pt", + original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", + use_finetuned: bool = True, + ) -> str: + """ + Python function that wraps around the application call for segmentation on ALCF. + + :param input_dir: the directory on the eagle file system (ALCF) where the input data are located + :param output_dir: the directory where segmentation results will be saved + :param script_module: the module path to the inference script + :param workdir: the path to the working directory containing the segmentation code + :param nproc_per_node: number of processes per node (typically number of GPUs) + :param nnodes: number of nodes to use + :param patch_size: size of patches for processing + :param batch_size: batch size per GPU + :param confidence: confidence threshold for predictions + :param prompts: list of text prompts for segmentation classes + :param bpe_path: path to BPE vocabulary file + :param finetuned_checkpoint: path to finetuned model checkpoint + :param original_checkpoint: path to original SAM3 checkpoint + :param use_finetuned: whether to use finetuned model (True) or original model (False) + :return: confirmation message + """ + import os + import subprocess + import time + + seg_start = time.time() + + # Move to directory where the segmentation code is located + os.chdir(workdir) + + # Build command + command = [ + "python", "-m", "torch.distributed.run", + f"--nproc_per_node={nproc_per_node}", + f"--nnodes={nnodes}", + "-m", script_module, + "--input-dir", input_dir, + "--output-dir", output_dir, + "--patch-size", str(patch_size), + "--batch-size", str(batch_size), + "--confidence", str(confidence), + "--prompts", *prompts, + "--bpe-path", bpe_path, + ] + + # Add checkpoint arguments based on whether using finetuned model + if use_finetuned: + command.extend([ + "--finetuned-checkpoint", finetuned_checkpoint, + "--original-checkpoint", original_checkpoint, + ]) + else: + command.extend([ + "--original-checkpoint", original_checkpoint, + ]) + + segment_res = subprocess.run(command) + + if segment_res.returncode != 0: + raise RuntimeError(f"Segmentation failed with return code {segment_res.returncode}") + + seg_end = time.time() + + print(f"Segmented data in {input_dir} in {seg_end - seg_start} seconds;\n {segment_res}") + return ( + f"Segmented data specified in {input_dir} in {seg_end - seg_start} seconds;\n" + f"{segment_res}" + ) + @staticmethod def _wait_for_globus_compute_future( future: Future, From 78e9c792a079c3034523d1f776c0d66bb413e809 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Feb 2026 15:18:57 -0800 Subject: [PATCH 23/35] adding globus compute configs for segmentation (single and multinode) --- ...bus_compute_segment_config_multi_node.yaml | 48 ++++++++++++++++++ ...us_compute_segment_config_single_node.yaml | 50 +++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 scripts/polaris/globus_compute_segment_config_multi_node.yaml create mode 100644 scripts/polaris/globus_compute_segment_config_single_node.yaml diff --git a/scripts/polaris/globus_compute_segment_config_multi_node.yaml b/scripts/polaris/globus_compute_segment_config_multi_node.yaml new file mode 100644 index 00000000..35de5bd9 --- /dev/null +++ 
b/scripts/polaris/globus_compute_segment_config_multi_node.yaml @@ -0,0 +1,48 @@ +engine: + type: GlobusComputeEngine + max_retries_on_system_failure: 0 + max_workers_per_node: 1 + prefetch_capacity: 0 + + address: + type: address_by_interface + ifname: bond0 + + strategy: simple + job_status_kwargs: + max_idletime: 300 + strategy_period: 60 + + provider: + type: PBSProProvider + + launcher: + type: SimpleLauncher + + account: SYNAPS-I + queue: demand + cpus_per_node: 64 # Full node for multi-node jobs + + # Request 4 nodes with 4 GPUs each + scheduler_options: "#PBS -l filesystems=home:eagle -l select=4:ngpus=4" + + worker_init: | + export TMPDIR=/tmp + module use /soft/modulefiles + module load conda + conda activate base + source /eagle/SYNAPS-I/segmentation/env/bin/activate + export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface + export HF_HOME=$HF_HUB_CACHE + export CUDA_DEVICE_ORDER=PCI_BUS_ID + # Enable IB for multi-node communication + export NCCL_IB_DISABLE=0 + export NCCL_P2P_DISABLE=0 + export OMP_NUM_THREADS=8 + cd /eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo + + walltime: 59:00 + nodes_per_block: 4 # Changed from 1 to 2 + init_blocks: 0 + min_blocks: 0 + max_blocks: 1 diff --git a/scripts/polaris/globus_compute_segment_config_single_node.yaml b/scripts/polaris/globus_compute_segment_config_single_node.yaml new file mode 100644 index 00000000..89dd9979 --- /dev/null +++ b/scripts/polaris/globus_compute_segment_config_single_node.yaml @@ -0,0 +1,50 @@ +engine: + type: GlobusComputeEngine # This engine uses the HighThroughputExecutor + max_retries_on_system_failure: 2 + # max_workers: 1 # Sets one worker per node + max_workers_per_node: 1 + prefetch_capacity: 0 # Increase if you have many more tasks than workers + + address: + type: address_by_interface + ifname: bond0 + + strategy: simple + job_status_kwargs: + max_idletime: 300 + strategy_period: 60 + + provider: + type: PBSProProvider + + launcher: + type: SimpleLauncher + # type: MpiExecLauncher + # Ensures 1 manger per node, work on all 64 cores + # bind_cmd: --cpu-bind + # overrides: --depth=64 --ppn 1 + + account: SYNAPS-I + queue: demand # debug (1-2 nodes), debug-scaling (1-10 nodes), or some other queue, probably want demand (1-56 nodes) for real-time things, prod (496 nodes) + # minimum node 1, max 56 nodes. 
Max time 59 minutes + cpus_per_node: 4 # may want to change to 4 (only 4 GPUs per node) + + # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" + scheduler_options: "#PBS -l filesystems=home:eagle -l select=1:ngpus=4" + # Node setup: activate necessary conda environment and such + # worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/segmentation/env/; export PATH=$PATH:/eagle/SYNAPS-I/; cd $HOME/.globus_compute/globus_compute_segmentation" + worker_init: | + export TMPDIR=/tmp + module use /soft/modulefiles + module load conda + conda activate base + source /eagle/SYNAPS-I/segmentation/env/bin/activate + export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface + export HF_HOME=$HF_HUB_CACHE + cd /eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo + + walltime: 59:00 # Jobs will end after 59 minutes + nodes_per_block: 1 # All jobs will have 1 node + init_blocks: 0 + min_blocks: 0 + max_blocks: 2 # No more than 1 job will be scheduled at a time From 0778cde1474eceeab670e3085882a8d5bbae26a7 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Feb 2026 15:19:26 -0800 Subject: [PATCH 24/35] removing old segment config --- .../globus_compute_segment_config.yaml | 48 ------------------- 1 file changed, 48 deletions(-) delete mode 100644 scripts/polaris/globus_compute_segment_config.yaml diff --git a/scripts/polaris/globus_compute_segment_config.yaml b/scripts/polaris/globus_compute_segment_config.yaml deleted file mode 100644 index 6479e84f..00000000 --- a/scripts/polaris/globus_compute_segment_config.yaml +++ /dev/null @@ -1,48 +0,0 @@ -engine: - type: GlobusComputeEngine # This engine uses the HighThroughputExecutor - max_retries_on_system_failure: 2 - # max_workers: 1 # Sets one worker per node - max_workers_per_node: 4 - prefetch_capacity: 0 # Increase if you have many more tasks than workers - - address: - type: address_by_interface - ifname: bond0 - - strategy: simple - job_status_kwargs: - max_idletime: 300 - strategy_period: 60 - - provider: - type: PBSProProvider - - launcher: - type: MpiExecLauncher - # Ensures 1 manger per node, work on all 64 cores - bind_cmd: --cpu-bind - overrides: --depth=64 --ppn 1 - - account: SYNAPS-I - queue: debug # debug (1-2 nodes), debug-scaling (1-10 nodes), or some other queue, probably want demand (1-56 nodes) for real-time things, prod (496 nodes) - # minimum node 1, max 56 nodes. 
Max time 59 minutes - cpus_per_node: 32 # may want to change to 4 (only 4 GPUs per node) - - # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" - scheduler_options: "#PBS -l filesystems=home:eagle -l select=1:ngpus=4" - # Node setup: activate necessary conda environment and such - # worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/segmentation/env/; export PATH=$PATH:/eagle/SYNAPS-I/; cd $HOME/.globus_compute/globus_compute_segmentation" - worker_init: | - module use /soft/modulefiles - module load conda - conda activate base - source /eagle/SYNAPS-I/segmentation/env/bin/activate - export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface - export HF_HOME=$HF_HUB_CACHE - cd /eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo - - walltime: 59:00 # Jobs will end after 59 minutes - nodes_per_block: 2 # All jobs will have 1 node - init_blocks: 0 - min_blocks: 0 - max_blocks: 2 # No more than 1 job will be scheduled at a time From 93da03ac7b42ac64bf2179cb8c7acbfad9e36abb Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Feb 2026 15:20:14 -0800 Subject: [PATCH 25/35] Segmentation wrapper v2 calls the segmentation inference version on polaris that scales well to multiple gpu nodes --- orchestration/flows/bl832/alcf.py | 218 +++++++++++++++++++++++------- 1 file changed, 172 insertions(+), 46 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 3d328510..1aa77a80 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -220,7 +220,7 @@ def segmentation( result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) elif SEGMENTATION_VERSION == "v2": - segmentation_module = "src.inference_v2" + segmentation_module = "src.inference_v2_optimized2" # "src.inference_v2" workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: logger.info(f"Running segmentation on {recon_folder_path} at ALCF") @@ -300,16 +300,101 @@ def _segmentation_wrapper_v1( f"{segment_res}" ) + # @staticmethod + # def _segmentation_wrapper_v2( + # input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", + # output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", + # script_module: str = "src.inference_v2", + # workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", + # nproc_per_node: int = 4, + # nnodes: int = 2, + # patch_size: int = 640, + # batch_size: int = 1, + # confidence: float = 0.5, + # prompts: list[str] = ["Cortex", "Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], + # bpe_path: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/bpe_simple_vocab_16e6.txt.gz", + # finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint.pt", + # original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", + # use_finetuned: bool = True, + # ) -> str: + # """ + # Python function that wraps around the application call for segmentation on ALCF. 
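# Aside (illustrative sketch, not part of the patch): the Globus Compute pattern that
# segmentation() relies on above -- serialize the wrapper with CombinedCode, submit it to a
# compute endpoint, and block on the returned future. The endpoint UUID below is a
# placeholder; the patch registers its own endpoint id, and the real flow polls the future
# with _wait_for_globus_compute_future instead of calling result() directly.
def _example_globus_compute_submit(recon_dir: str, seg_dir: str) -> str:
    from globus_compute_sdk import Client, Executor
    from globus_compute_sdk.serialize import CombinedCode

    def remote_noop(input_dir: str, output_dir: str) -> str:
        # Stand-in for _segmentation_wrapper_v2; this body runs on the endpoint's worker.
        return f"would segment {input_dir} into {output_dir}"

    gcc = Client(code_serialization_strategy=CombinedCode())
    with Executor(endpoint_id="00000000-0000-0000-0000-000000000000", client=gcc) as fxe:
        future = fxe.submit(remote_noop, input_dir=recon_dir, output_dir=seg_dir)
        return future.result()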
+ + # :param input_dir: the directory on the eagle file system (ALCF) where the input data are located + # :param output_dir: the directory where segmentation results will be saved + # :param script_module: the module path to the inference script + # :param workdir: the path to the working directory containing the segmentation code + # :param nproc_per_node: number of processes per node (typically number of GPUs) + # :param nnodes: number of nodes to use + # :param patch_size: size of patches for processing + # :param batch_size: batch size per GPU + # :param confidence: confidence threshold for predictions + # :param prompts: list of text prompts for segmentation classes + # :param bpe_path: path to BPE vocabulary file + # :param finetuned_checkpoint: path to finetuned model checkpoint + # :param original_checkpoint: path to original SAM3 checkpoint + # :param use_finetuned: whether to use finetuned model (True) or original model (False) + # :return: confirmation message + # """ + # import os + # import subprocess + # import time + + # seg_start = time.time() + + # # Move to directory where the segmentation code is located + # os.chdir(workdir) + + # # Build command + # command = [ + # "python", "-m", "torch.distributed.run", + # f"--nproc_per_node={nproc_per_node}", + # f"--nnodes={nnodes}", + # "-m", script_module, + # "--input-dir", input_dir, + # "--output-dir", output_dir, + # "--patch-size", str(patch_size), + # "--batch-size", str(batch_size), + # "--confidence", str(confidence), + # "--prompts", *prompts, + # "--bpe-path", bpe_path, + # ] + + # # Add checkpoint arguments based on whether using finetuned model + # if use_finetuned: + # command.extend([ + # "--finetuned-checkpoint", finetuned_checkpoint, + # "--original-checkpoint", original_checkpoint, + # ]) + # else: + # command.extend([ + # "--original-checkpoint", original_checkpoint, + # ]) + + # segment_res = subprocess.run(command) + + # if segment_res.returncode != 0: + # raise RuntimeError(f"Segmentation failed with return code {segment_res.returncode}") + + # seg_end = time.time() + + # print(f"Segmented data in {input_dir} in {seg_end - seg_start} seconds;\n {segment_res}") + # return ( + # f"Segmented data specified in {input_dir} in {seg_end - seg_start} seconds;\n" + # f"{segment_res}" + # ) + @staticmethod def _segmentation_wrapper_v2( input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", - script_module: str = "src.inference_v2", + # script_module: str = "src.inference_v2", + script_module: str = "src.inference_v2_optimized", workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", nproc_per_node: int = 4, - nnodes: int = 1, + nnodes: int = 2, patch_size: int = 640, - batch_size: int = 1, + batch_size: int = 8, confidence: float = 0.5, prompts: list[str] = ["Cortex", "Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], bpe_path: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/bpe_simple_vocab_16e6.txt.gz", @@ -317,72 +402,113 @@ def _segmentation_wrapper_v2( original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", use_finetuned: bool = True, ) -> str: - """ - Python function that wraps around the application call for segmentation on ALCF. 
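# Aside (illustrative sketch, not part of the patch): what the multi-node launch below boils
# down to. Each node runs one torch.distributed.run process; the first node listed in
# PBS_NODEFILE acts as the c10d rendezvous host, and the effective world size is
# nnodes * nproc_per_node. The nodefile contents and module name here are hypothetical.
def _example_torchrun_plan(nproc_per_node: int = 4) -> None:
    nodefile_lines = ["polaris-node-a", "polaris-node-a", "polaris-node-b", "polaris-node-b"]
    unique_nodes = list(dict.fromkeys(line.strip() for line in nodefile_lines if line.strip()))
    nnodes = len(unique_nodes)
    master_addr = unique_nodes[0]
    print(f"world size = {nnodes * nproc_per_node} ranks across {nnodes} nodes")
    print(
        "per-node command: python -m torch.distributed.run "
        f"--nnodes={nnodes} --nproc_per_node={nproc_per_node} "
        f"--rdzv_backend=c10d --rdzv_endpoint={master_addr}:29500 -m <inference module> ..."
    )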
- - :param input_dir: the directory on the eagle file system (ALCF) where the input data are located - :param output_dir: the directory where segmentation results will be saved - :param script_module: the module path to the inference script - :param workdir: the path to the working directory containing the segmentation code - :param nproc_per_node: number of processes per node (typically number of GPUs) - :param nnodes: number of nodes to use - :param patch_size: size of patches for processing - :param batch_size: batch size per GPU - :param confidence: confidence threshold for predictions - :param prompts: list of text prompts for segmentation classes - :param bpe_path: path to BPE vocabulary file - :param finetuned_checkpoint: path to finetuned model checkpoint - :param original_checkpoint: path to original SAM3 checkpoint - :param use_finetuned: whether to use finetuned model (True) or original model (False) - :return: confirmation message - """ import os import subprocess import time seg_start = time.time() - - # Move to directory where the segmentation code is located os.chdir(workdir) - # Build command - command = [ - "python", "-m", "torch.distributed.run", + # Get PBS info + pbs_nodefile = os.environ.get("PBS_NODEFILE") + pbs_jobid = os.environ.get("PBS_JOBID", "12345") + + print("=== PBS DEBUG ===") + print(f"PBS_NODEFILE: {pbs_nodefile}") + print(f"PBS_JOBID: {pbs_jobid}") + + if pbs_nodefile and os.path.exists(pbs_nodefile): + with open(pbs_nodefile, 'r') as f: + all_lines = [line.strip() for line in f if line.strip()] + unique_nodes = list(dict.fromkeys(all_lines)) + actual_nnodes = len(unique_nodes) + master_addr = unique_nodes[0] + print(f"PBS_NODEFILE contents: {all_lines}") + print(f"Unique nodes ({actual_nnodes}): {unique_nodes}") + print(f"Master: {master_addr}") + else: + actual_nnodes = 1 + master_addr = "localhost" + print("No PBS_NODEFILE, single node mode") + + # Use explicit path to torchrun from the virtual environment + venv_path = "/eagle/SYNAPS-I/segmentation/env" + # torchrun_path = f"{venv_path}/bin/torchrun" + # python_path = f"{venv_path}/bin/python" + + # Build torchrun arguments + torchrun_args = [ + f"--nnodes={actual_nnodes}", f"--nproc_per_node={nproc_per_node}", - f"--nnodes={nnodes}", + f"--rdzv_id={pbs_jobid}", + "--rdzv_backend=c10d", + f"--rdzv_endpoint={master_addr}:29500", "-m", script_module, "--input-dir", input_dir, "--output-dir", output_dir, "--patch-size", str(patch_size), "--batch-size", str(batch_size), "--confidence", str(confidence), - "--prompts", *prompts, - "--bpe-path", bpe_path, + "--prompts", ] + # torchrun_args.extend(prompts) + torchrun_args.extend([f'"{p}"' for p in prompts]) + + torchrun_args.extend(["--bpe-path", bpe_path]) - # Add checkpoint arguments based on whether using finetuned model if use_finetuned: - command.extend([ + torchrun_args.extend([ "--finetuned-checkpoint", finetuned_checkpoint, "--original-checkpoint", original_checkpoint, ]) else: - command.extend([ - "--original-checkpoint", original_checkpoint, - ]) + torchrun_args.extend(["--original-checkpoint", original_checkpoint]) + + torchrun_cmd = f"{venv_path}/bin/python -m torch.distributed.run " + " ".join(torchrun_args) + + # Environment + NCCL setup - activate venv and set PATH explicitly + env_setup = ( + f"source {venv_path}/bin/activate && " + f"export PATH={venv_path}/bin:$PATH && " + "export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface && " + "export HF_HOME=$HF_HUB_CACHE && " + "export CUDA_DEVICE_ORDER=PCI_BUS_ID && " + "export 
NCCL_NET_GDR_LEVEL=PHB && " + "export NCCL_CROSS_NIC=1 && " + "export NCCL_COLLNET_ENABLE=1 && " + 'export NCCL_NET="AWS Libfabric" && ' + "export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH && " + "export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH && " + "export FI_CXI_DISABLE_HOST_REGISTER=1 && " + "export FI_MR_CACHE_MONITOR=userfaultfd && " + "export FI_CXI_DEFAULT_CQ_SIZE=131072 && " + f"cd {workdir} && " + ) - segment_res = subprocess.run(command) + if actual_nnodes > 1: + # Use mpiexec to launch torchrun on all nodes + command = [ + "mpiexec", + "-n", str(actual_nnodes), + "-ppn", "1", + "-hostfile", pbs_nodefile, + "--cpu-bind", "depth", + "-d", "16", + "bash", "-c", env_setup + torchrun_cmd + ] + else: + command = ["bash", "-c", env_setup + torchrun_cmd] - if segment_res.returncode != 0: - raise RuntimeError(f"Segmentation failed with return code {segment_res.returncode}") + print(f"Running: {' '.join(command)}") - seg_end = time.time() + result = subprocess.run(command, stdout=None, stderr=None, text=True) + print(f"STDOUT: {result.stdout[-3000:] if result.stdout else 'None'}") + print(f"STDERR: {result.stderr[-3000:] if result.stderr else 'None'}") - print(f"Segmented data in {input_dir} in {seg_end - seg_start} seconds;\n {segment_res}") - return ( - f"Segmented data specified in {input_dir} in {seg_end - seg_start} seconds;\n" - f"{segment_res}" - ) + if result.returncode != 0: + raise RuntimeError(f"Segmentation failed: {result.returncode}\nSTDERR: {result.stderr[-2000:]}") + + return f"Completed in {time.time() - seg_start:.1f}s" @staticmethod def _wait_for_globus_compute_future( @@ -821,7 +947,7 @@ def alcf_segmentation_integration_test() -> bool: """ logger = get_run_logger() logger.info("Starting ALCF segmentation integration test.") - recon_folder_path = 'test' # 'rec20211222_125057_petiole4' + recon_folder_path = 'rec20211222_125057_petiole4' # 'test' # flow_success = alcf_segmentation_task( recon_folder_path=recon_folder_path, config=Config832() From b2ee3cefbf7926ec5bb7a1ae0accc56b41617e38 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Feb 2026 15:29:48 -0800 Subject: [PATCH 26/35] removing stale comments --- orchestration/flows/bl832/alcf.py | 84 ------------------------------- 1 file changed, 84 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 1aa77a80..15db25b5 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -300,90 +300,6 @@ def _segmentation_wrapper_v1( f"{segment_res}" ) - # @staticmethod - # def _segmentation_wrapper_v2( - # input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", - # output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", - # script_module: str = "src.inference_v2", - # workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", - # nproc_per_node: int = 4, - # nnodes: int = 2, - # patch_size: int = 640, - # batch_size: int = 1, - # confidence: float = 0.5, - # prompts: list[str] = ["Cortex", "Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], - # bpe_path: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/bpe_simple_vocab_16e6.txt.gz", - # finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint.pt", - # original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", - # use_finetuned: bool = True, - # ) -> str: - # """ - # 
Python function that wraps around the application call for segmentation on ALCF. - - # :param input_dir: the directory on the eagle file system (ALCF) where the input data are located - # :param output_dir: the directory where segmentation results will be saved - # :param script_module: the module path to the inference script - # :param workdir: the path to the working directory containing the segmentation code - # :param nproc_per_node: number of processes per node (typically number of GPUs) - # :param nnodes: number of nodes to use - # :param patch_size: size of patches for processing - # :param batch_size: batch size per GPU - # :param confidence: confidence threshold for predictions - # :param prompts: list of text prompts for segmentation classes - # :param bpe_path: path to BPE vocabulary file - # :param finetuned_checkpoint: path to finetuned model checkpoint - # :param original_checkpoint: path to original SAM3 checkpoint - # :param use_finetuned: whether to use finetuned model (True) or original model (False) - # :return: confirmation message - # """ - # import os - # import subprocess - # import time - - # seg_start = time.time() - - # # Move to directory where the segmentation code is located - # os.chdir(workdir) - - # # Build command - # command = [ - # "python", "-m", "torch.distributed.run", - # f"--nproc_per_node={nproc_per_node}", - # f"--nnodes={nnodes}", - # "-m", script_module, - # "--input-dir", input_dir, - # "--output-dir", output_dir, - # "--patch-size", str(patch_size), - # "--batch-size", str(batch_size), - # "--confidence", str(confidence), - # "--prompts", *prompts, - # "--bpe-path", bpe_path, - # ] - - # # Add checkpoint arguments based on whether using finetuned model - # if use_finetuned: - # command.extend([ - # "--finetuned-checkpoint", finetuned_checkpoint, - # "--original-checkpoint", original_checkpoint, - # ]) - # else: - # command.extend([ - # "--original-checkpoint", original_checkpoint, - # ]) - - # segment_res = subprocess.run(command) - - # if segment_res.returncode != 0: - # raise RuntimeError(f"Segmentation failed with return code {segment_res.returncode}") - - # seg_end = time.time() - - # print(f"Segmented data in {input_dir} in {seg_end - seg_start} seconds;\n {segment_res}") - # return ( - # f"Segmented data specified in {input_dir} in {seg_end - seg_start} seconds;\n" - # f"{segment_res}" - # ) - @staticmethod def _segmentation_wrapper_v2( input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", From e38815bf958f8233e66beccb42d1172df1791e1e Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Feb 2026 15:30:07 -0800 Subject: [PATCH 27/35] removing stale comments --- orchestration/flows/bl832/alcf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 15db25b5..b9ae17c5 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -349,8 +349,6 @@ def _segmentation_wrapper_v2( # Use explicit path to torchrun from the virtual environment venv_path = "/eagle/SYNAPS-I/segmentation/env" - # torchrun_path = f"{venv_path}/bin/torchrun" - # python_path = f"{venv_path}/bin/python" # Build torchrun arguments torchrun_args = [ @@ -367,7 +365,6 @@ def _segmentation_wrapper_v2( "--confidence", str(confidence), "--prompts", ] - # torchrun_args.extend(prompts) torchrun_args.extend([f'"{p}"' for p in prompts]) torchrun_args.extend(["--bpe-path", bpe_path]) From 738ada2201962dc5f81e2020928105bd9e4ccf7e Mon Sep 17 00:00:00 2001 From: 
David Abramov Date: Wed, 4 Feb 2026 15:41:51 -0800 Subject: [PATCH 28/35] Cleaning, adding helpful comments --- orchestration/flows/bl832/alcf.py | 138 +++++++++--------------------- 1 file changed, 39 insertions(+), 99 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index b9ae17c5..253540bb 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -191,8 +191,6 @@ def segmentation( """ logger = get_run_logger() - SEGMENTATION_VERSION = "v2" # "v2" - # Operate on reconstructed data rundir = f"{self.allocation_root}/data/bl832/scratch/reconstruction/{recon_folder_path}" output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{recon_folder_path}" @@ -204,111 +202,28 @@ def segmentation( endpoint_id = "168c595b-9493-42db-9c6a-aad960913de2" # with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: - if SEGMENTATION_VERSION == "v1": - segmentation_module = "src.inference" - workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo" - - with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: - logger.info(f"Running segmentation on {recon_folder_path} at ALCF") - future = fxe.submit( - self._segmentation_wrapper_v1, - input_dir=rundir, - output_dir=output_dir, - script_module=segmentation_module, - workdir=workdir - ) - result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) - - elif SEGMENTATION_VERSION == "v2": - segmentation_module = "src.inference_v2_optimized2" # "src.inference_v2" - workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" - with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: - logger.info(f"Running segmentation on {recon_folder_path} at ALCF") - future = fxe.submit( - self._segmentation_wrapper_v2, - input_dir=rundir, - output_dir=output_dir, - script_module=segmentation_module, - workdir=workdir - ) - result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) + segmentation_module = "src.inference_v2_optimized2" + workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" + with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: + logger.info(f"Running segmentation on {recon_folder_path} at ALCF") + future = fxe.submit( + self._segmentation_wrapper_v2, + input_dir=rundir, + output_dir=output_dir, + script_module=segmentation_module, + workdir=workdir + ) + result = self._wait_for_globus_compute_future(future, "segmentation", check_interval=10) return result @staticmethod - def _segmentation_wrapper_v1( - input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", - output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", - script_module: str = "src.inference", - workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo", - nproc_per_node: int = 4, # 1 works - nnodes: int = 1, - nnode_rank: int = 0, - master_addr: str = "localhost", - master_port: str = "29500", - patch_size: int = 512, - batch_size: int = 1, - num_workers: int = 4, - confidence: float = 0.5, - prompts: list[str] = ["background", "cell"], - ) -> str: - """ - Python function that wraps around the application call for segmentation on ALCF - - :param rundir: the directory on the eagle file system (ALCF) where the input data are located - :param script_path: the path to the script that will run the segmentation - :param folder_path: the path 
to the folder containing the TIFF data to be segmented - :return: confirmation message - """ - import os - import subprocess - import time - - seg_start = time.time() - - # Move to directory where the segmentation code is located - os.chdir(workdir) - - # Run segmentation.py - command = [ - "python", "-m", "torch.distributed.run", - f"--nproc_per_node={nproc_per_node}", - f"--nnodes={nnodes}", - f"--node_rank={nnode_rank}", - f"--master_addr={master_addr}", - f"--master_port={master_port}", - "-m", script_module, - "--input-dir", input_dir, - "--output-dir", output_dir, - "--patch-size", str(patch_size), - "--batch-size", str(batch_size), - "--num-workers", str(num_workers), - "--confidence", str(confidence), - "--prompts", *prompts, - ] - - segment_res = subprocess.run(command) # stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - if segment_res.returncode != 0: - raise RuntimeError(f"Segmentation failed with return code {segment_res.returncode}") - - seg_end = time.time() - - print(f"Segmented data in {input_dir} in {seg_end-seg_start} seconds;\n {segment_res}") - return ( - f"Segmented data specified in {input_dir} in {seg_end-seg_start} seconds;\n" - f"{segment_res}" - ) - - @staticmethod - def _segmentation_wrapper_v2( + def _segmentation_wrapper( input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", - # script_module: str = "src.inference_v2", - script_module: str = "src.inference_v2_optimized", + script_module: str = "src.inference_v2_optimized2", workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", nproc_per_node: int = 4, - nnodes: int = 2, patch_size: int = 640, batch_size: int = 8, confidence: float = 0.5, @@ -318,6 +233,26 @@ def _segmentation_wrapper_v2( original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", use_finetuned: bool = True, ) -> str: + """ + Wrapper function to run segmentation using torch.distributed.run on ALCF. + This is the code that is executed by Globus Compute. + + :param input_dir: Directory containing input data for segmentation. + :param output_dir: Directory to save segmentation outputs. + :param script_module: Python module to run for segmentation. + :param workdir: Working directory for the segmentation script. + :param nproc_per_node: Number of processes per node. + :param patch_size: Size of the patches for segmentation. + :param batch_size: Batch size for segmentation. + :param confidence: Confidence threshold for segmentation. + :param prompts: List of prompts for segmentation. + :param bpe_path: Path to the BPE vocabulary file. + :param finetuned_checkpoint: Path to the finetuned model checkpoint. + :param original_checkpoint: Path to the original model checkpoint. + :param use_finetuned: Whether to use the finetuned model checkpoint. + + :return: Confirmation message upon completion. 
+ """ import os import subprocess import time @@ -333,6 +268,7 @@ def _segmentation_wrapper_v2( print(f"PBS_NODEFILE: {pbs_nodefile}") print(f"PBS_JOBID: {pbs_jobid}") + # Determine number of nodes and master address based on PBS_NODEFILE if pbs_nodefile and os.path.exists(pbs_nodefile): with open(pbs_nodefile, 'r') as f: all_lines = [line.strip() for line in f if line.strip()] @@ -351,6 +287,7 @@ def _segmentation_wrapper_v2( venv_path = "/eagle/SYNAPS-I/segmentation/env" # Build torchrun arguments + # rdzv is used for rendezvous in multi-node setups, meaning all nodes can find each other torchrun_args = [ f"--nnodes={actual_nnodes}", f"--nproc_per_node={nproc_per_node}", @@ -365,6 +302,7 @@ def _segmentation_wrapper_v2( "--confidence", str(confidence), "--prompts", ] + # Add prompts to the arguments, each prompt is a separate argument torchrun_args.extend([f'"{p}"' for p in prompts]) torchrun_args.extend(["--bpe-path", bpe_path]) @@ -380,6 +318,8 @@ def _segmentation_wrapper_v2( torchrun_cmd = f"{venv_path}/bin/python -m torch.distributed.run " + " ".join(torchrun_args) # Environment + NCCL setup - activate venv and set PATH explicitly + # Following best practices from ALCF: + # https://docs.alcf.anl.gov/polaris/data-science/frameworks/pytorch/#multi-gpu-multi-node-scale-up env_setup = ( f"source {venv_path}/bin/activate && " f"export PATH={venv_path}/bin:$PATH && " From 2d56d4d8b8701cb3f94d63ab3655b20bbe31dce6 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 4 Feb 2026 16:12:18 -0800 Subject: [PATCH 29/35] typo, and cleaning file paths --- orchestration/flows/bl832/alcf.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 253540bb..ebe8f290 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -192,22 +192,28 @@ def segmentation( logger = get_run_logger() # Operate on reconstructed data + # Input: folder_name/rec20211222_125057_petiole4/ + # Output should go to: folder_name/seg20211222_125057_petiole4/ + rundir = f"{self.allocation_root}/data/bl832/scratch/reconstruction/{recon_folder_path}" - output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{recon_folder_path}" + output_folder = recon_folder_path.replace('/rec', '/seg') + output_dir = f"{self.allocation_root}/data/bl832/scratch/segmentation/{output_folder}" gcc = Client(code_serialization_strategy=CombinedCode()) - # TODO: Update globus-compute-endpoint Secret block with the new endpoint UUID - # We will probably have 2 endpoints, one for recon, one for segmentation - endpoint_id = "168c595b-9493-42db-9c6a-aad960913de2" - # with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: + endpoint_id = Variable.get( + "alcf-globus-compute-seg-uuid", + default="168c595b-9493-42db-9c6a-aad960913de2", + _sync=True + ) segmentation_module = "src.inference_v2_optimized2" workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" + with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: logger.info(f"Running segmentation on {recon_folder_path} at ALCF") future = fxe.submit( - self._segmentation_wrapper_v2, + self._segmentation_wrapper, input_dir=rundir, output_dir=output_dir, script_module=segmentation_module, @@ -740,7 +746,7 @@ def alcf_forge_recon_segment_flow( ) # Prune segmented data from data832 scratch - if alcf_segmentation_success: + if segment_transfer_success: 
logger.info("Scheduling pruning of data832 scratch segmentation data.") prune_controller.prune( file_path=scratch_path_segment, From 7587cd6ab7881ec2d9d8c8bbb51a9dbfed64e8de Mon Sep 17 00:00:00 2001 From: David Abramov Date: Thu, 5 Feb 2026 13:28:07 -0800 Subject: [PATCH 30/35] Adding ALCF recon multinode --- orchestration/flows/bl832/alcf.py | 154 +++++++++++++++++- ...globus_compute_recon_config_multinode.yaml | 39 +++++ 2 files changed, 188 insertions(+), 5 deletions(-) create mode 100644 scripts/polaris/globus_compute_recon_config_multinode.yaml diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index ebe8f290..4b392e3e 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -58,16 +58,24 @@ def reconstruct( folder_name = Path(file_path).parent.name rundir = f"{self.allocation_root}/data/bl832/raw" - recon_script = f"{self.allocation_root}/reconstruction/scripts/globus_reconstruction.py" + recon_script = f"{self.allocation_root}/reconstruction/scripts/globus_reconstruction_multinode.py" # globus_reconstruction.py" gcc = Client(code_serialization_strategy=CombinedCode()) + # endpoint_id = Secret.load("globus-compute-endpoint").get() + + endpoint_id = Variable.get( + "alcf-globus-compute-recon-uuid", + default="4953017e-6127-4587-9ee3-b71db7623122", + _sync=True + ) + # TODO: Update globus-compute-endpoint Secret block with the new endpoint UUID # We will probably have 2 endpoints, one for recon, one for segmentation - with Executor(endpoint_id=Secret.load("globus-compute-endpoint").get(), client=gcc) as fxe: + with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: logger.info(f"Running Tomopy reconstruction on {file_name} at ALCF") future = fxe.submit( - self._reconstruct_wrapper, + self._reconstruct_wrapper_multinode, rundir, recon_script, file_name, @@ -114,6 +122,118 @@ def _reconstruct_wrapper( f"{recon_res}" ) + @staticmethod + def _reconstruct_wrapper_multinode( + rundir: str, + script_path: str, + h5_file_name: str, + folder_path: str, + ) -> str: + import os + import subprocess + import time + import h5py + + rec_start = time.time() + os.chdir(rundir) + + # Get PBS info + pbs_nodefile = os.environ.get("PBS_NODEFILE") + + if pbs_nodefile and os.path.exists(pbs_nodefile): + with open(pbs_nodefile, 'r') as f: + all_lines = [line.strip() for line in f if line.strip()] + unique_nodes = list(dict.fromkeys(all_lines)) + num_nodes = len(unique_nodes) + else: + num_nodes = 1 + unique_nodes = ["localhost"] + + # Read number of slices from HDF5 + h5_path = f"{rundir}/{folder_path}/{h5_file_name}" + with h5py.File(h5_path, 'r') as f: + if '/exchange/data' in f: + num_slices = f['/exchange/data'].shape[1] + else: + # fallback to attrs + for key in f.keys(): + if 'nslices' in f[key].attrs: + num_slices = int(f[key].attrs['nslices']) + break + + print("=== RECON DEBUG ===") + print(f"PBS_NODEFILE: {pbs_nodefile}") + print(f"Unique nodes ({num_nodes}): {unique_nodes}") + print(f"Total slices: {num_slices}") + + slices_per_node = num_slices // num_nodes + + venv_path = "/eagle/SYNAPS-I/reconstruction/env/tomopy" + env_setup = ( + "export TMPDIR=/tmp && " + "module use /soft/modulefiles && " + "module load conda && " + "source $(conda info --base)/etc/profile.d/conda.sh && " + f"conda activate {venv_path} && " + f"cd {rundir} && " + ) + + if num_nodes > 1: + import tempfile + + # Launch each node's work as a separate background process via mpiexec + procs = [] + temp_hostfiles = [] + + for i, node in enumerate(unique_nodes): + 
sino_start = i * slices_per_node + sino_end = num_slices if i == num_nodes - 1 else (i + 1) * slices_per_node + + cmd = f"python {script_path} {h5_file_name} {folder_path} {sino_start} {sino_end}" + + # Write single-node hostfile + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.hosts') as f: + f.write(node + '\n') + temp_hostfile = f.name + temp_hostfiles.append(temp_hostfile) + + full_cmd = [ + "mpiexec", + "-n", "1", + "-ppn", "1", + "-hostfile", temp_hostfile, + "bash", "-c", env_setup + cmd + ] + + print(f"Launching on {node}: slices {sino_start}-{sino_end}") + proc = subprocess.Popen(full_cmd) + procs.append((proc, node)) + + # Wait for all + failed = [] + for proc, node in procs: + proc.wait() + if proc.returncode != 0: + failed.append(node) + + # Cleanup temp hostfiles + for hf in temp_hostfiles: + try: + os.remove(hf) + except OSError: + pass + + if failed: + raise RuntimeError(f"Reconstruction failed on nodes: {failed}") + else: + # Single node - run directly + cmd = f"python {script_path} {h5_file_name} {folder_path}" + result = subprocess.run(["bash", "-c", env_setup + cmd]) + if result.returncode != 0: + raise RuntimeError("Reconstruction failed") + + return f"Reconstructed {h5_file_name} across {num_nodes} nodes in {time.time() - rec_start:.1f}s" + def build_multi_resolution( self, file_path: str = "", @@ -746,7 +866,7 @@ def alcf_forge_recon_segment_flow( ) # Prune segmented data from data832 scratch - if segment_transfer_success: + if alcf_segmentation_success and segment_transfer_success: logger.info("Scheduling pruning of data832 scratch segmentation data.") prune_controller.prune( file_path=scratch_path_segment, @@ -815,5 +935,29 @@ def alcf_segmentation_integration_test() -> bool: return flow_success +@flow(name="alcf_reconstruction_integration_test", flow_run_name="alcf_reconstruction_integration_test") +def alcf_reconstruction_integration_test() -> bool: + """ + Integration test for the ALCF reconstruction task. + + :return: True if the reconstruction task completed successfully, False otherwise. 
+ """ + logger = get_run_logger() + logger.info("Starting ALCF reconstruction integration test.") + raw_file_path = '_ra-00823_bard/20251218_111600_silkraw.h5' # 'test' # + + tomography_controller = get_controller( + hpc_type=HPC.ALCF, + config=Config832() + ) + + flow_success = tomography_controller.reconstruct( + file_path=f"{raw_file_path}", + ) + + logger.info(f"Flow success: {flow_success}") + return flow_success + + if __name__ == "__main__": - alcf_segmentation_integration_test() + alcf_reconstruction_integration_test() diff --git a/scripts/polaris/globus_compute_recon_config_multinode.yaml b/scripts/polaris/globus_compute_recon_config_multinode.yaml new file mode 100644 index 00000000..8ae3d728 --- /dev/null +++ b/scripts/polaris/globus_compute_recon_config_multinode.yaml @@ -0,0 +1,39 @@ +engine: + type: GlobusComputeEngine # This engine uses the HighThroughputExecutor + max_retries_on_system_failure: 0 + max_workers: 1 # Sets one worker per node + prefetch_capacity: 0 # Increase if you have many more tasks than workers + + address: + type: address_by_interface + ifname: bond0 + + strategy: simple + job_status_kwargs: + max_idletime: 300 + strategy_period: 60 + + provider: + type: PBSProProvider + + launcher: + type: MpiExecLauncher + # Ensures 1 manger per node, work on all 64 cores + bind_cmd: --cpu-bind + overrides: --depth=64 --ppn 1 + + account: SYNAPS-I + queue: demand + cpus_per_node: 64 + + # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe" + scheduler_options: "#PBS -l filesystems=home:eagle" + + # Node setup: activate necessary conda environment and such + worker_init: "module use /soft/modulefiles; module load conda; conda activate /eagle/SYNAPS-I/reconstruction/env/tomopy; export PATH=$PATH:/eagle/SYNAPSE-I/; cd $HOME/.globus_compute/globus_compute_reconstruction" + + walltime: 59:00 # Jobs will end after 60 minutes + nodes_per_block: 4 # All jobs will have 1 node + init_blocks: 0 + min_blocks: 0 + max_blocks: 2 # No more than 1 job will be scheduled at a time From a8da1eb54fe77b7025af39fc7570a456726e6c79 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Thu, 5 Feb 2026 14:10:05 -0800 Subject: [PATCH 31/35] More optimization. 
Went from 284s on 8 nodes to 77s --- orchestration/flows/bl832/alcf.py | 151 +++++++++++++++--------------- 1 file changed, 74 insertions(+), 77 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 4b392e3e..a11acfc4 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -58,7 +58,7 @@ def reconstruct( folder_name = Path(file_path).parent.name rundir = f"{self.allocation_root}/data/bl832/raw" - recon_script = f"{self.allocation_root}/reconstruction/scripts/globus_reconstruction_multinode.py" # globus_reconstruction.py" + recon_script = f"{self.allocation_root}/reconstruction/scripts/globus_reconstruction_multinode.py" gcc = Client(code_serialization_strategy=CombinedCode()) @@ -128,49 +128,50 @@ def _reconstruct_wrapper_multinode( script_path: str, h5_file_name: str, folder_path: str, + node_list: list[str] = None, # Pass explicitly + num_nodes: int = 8, ) -> str: import os import subprocess import time import h5py + import tempfile rec_start = time.time() os.chdir(rundir) - # Get PBS info - pbs_nodefile = os.environ.get("PBS_NODEFILE") + # If node_list not provided, try PBS_NODEFILE + if node_list is None: + pbs_nodefile = os.environ.get("PBS_NODEFILE") + if pbs_nodefile and os.path.exists(pbs_nodefile): + with open(pbs_nodefile, 'r') as f: + all_lines = [line.strip() for line in f if line.strip()] + node_list = list(dict.fromkeys(all_lines)) + else: + # Fallback: get nodes from PBS_NODENUM or assume localhost + node_list = ["localhost"] - if pbs_nodefile and os.path.exists(pbs_nodefile): - with open(pbs_nodefile, 'r') as f: - all_lines = [line.strip() for line in f if line.strip()] - unique_nodes = list(dict.fromkeys(all_lines)) - num_nodes = len(unique_nodes) - else: - num_nodes = 1 - unique_nodes = ["localhost"] + num_nodes = len(node_list) + print("=== RECON DEBUG ===") + print(f"Using {num_nodes} nodes: {node_list}") - # Read number of slices from HDF5 + # Read number of slices h5_path = f"{rundir}/{folder_path}/{h5_file_name}" with h5py.File(h5_path, 'r') as f: - if '/exchange/data' in f: - num_slices = f['/exchange/data'].shape[1] - else: - # fallback to attrs - for key in f.keys(): - if 'nslices' in f[key].attrs: - num_slices = int(f[key].attrs['nslices']) - break + num_slices = f['/exchange/data'].shape[1] - print("=== RECON DEBUG ===") - print(f"PBS_NODEFILE: {pbs_nodefile}") - print(f"Unique nodes ({num_nodes}): {unique_nodes}") print(f"Total slices: {num_slices}") - slices_per_node = num_slices // num_nodes venv_path = "/eagle/SYNAPS-I/reconstruction/env/tomopy" + + # Critical: Set environment variables BEFORE the conda activation env_setup = ( "export TMPDIR=/tmp && " + "export NUMEXPR_MAX_THREADS=64 && " + "export NUMEXPR_NUM_THREADS=64 && " + "export OMP_NUM_THREADS=64 && " + "export MKL_NUM_THREADS=64 && " "module use /soft/modulefiles && " "module load conda && " "source $(conda info --base)/etc/profile.d/conda.sh && " @@ -178,59 +179,55 @@ def _reconstruct_wrapper_multinode( f"cd {rundir} && " ) - if num_nodes > 1: - import tempfile - - # Launch each node's work as a separate background process via mpiexec - procs = [] - temp_hostfiles = [] - - for i, node in enumerate(unique_nodes): - sino_start = i * slices_per_node - sino_end = num_slices if i == num_nodes - 1 else (i + 1) * slices_per_node - - cmd = f"python {script_path} {h5_file_name} {folder_path} {sino_start} {sino_end}" - - # Write single-node hostfile - with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.hosts') as 
f: - f.write(node + '\n') - temp_hostfile = f.name - temp_hostfiles.append(temp_hostfile) - - full_cmd = [ - "mpiexec", - "-n", "1", - "-ppn", "1", - "-hostfile", temp_hostfile, - "bash", "-c", env_setup + cmd - ] - - print(f"Launching on {node}: slices {sino_start}-{sino_end}") - proc = subprocess.Popen(full_cmd) - procs.append((proc, node)) - - # Wait for all - failed = [] - for proc, node in procs: - proc.wait() - if proc.returncode != 0: - failed.append(node) - - # Cleanup temp hostfiles - for hf in temp_hostfiles: - try: - os.remove(hf) - except OSError: - pass - - if failed: - raise RuntimeError(f"Reconstruction failed on nodes: {failed}") - else: - # Single node - run directly - cmd = f"python {script_path} {h5_file_name} {folder_path}" - result = subprocess.run(["bash", "-c", env_setup + cmd]) - if result.returncode != 0: - raise RuntimeError("Reconstruction failed") + procs = [] + temp_hostfiles = [] + + for i, node in enumerate(node_list): + sino_start = i * slices_per_node + sino_end = num_slices if i == num_nodes - 1 else (i + 1) * slices_per_node + + cmd = f"python {script_path} {h5_file_name} {folder_path} {sino_start} {sino_end}" + + with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.hosts') as f: + f.write(node + '\n') + temp_hostfile = f.name + temp_hostfiles.append(temp_hostfile) + + # Use --cpu-bind to ensure proper CPU affinity + full_cmd = [ + "mpiexec", + "-n", "1", + "-ppn", "1", + "--cpu-bind", "depth", + "-d", "64", # depth=64 cores per rank + "-hostfile", temp_hostfile, + "bash", "-c", env_setup + cmd + ] + + print(f"Launching on {node}: slices {sino_start}-{sino_end}") + proc = subprocess.Popen(full_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + procs.append((proc, node, sino_start, sino_end)) + + # Wait and collect results + failed = [] + for proc, node, sino_start, sino_end in procs: + stdout, stderr = proc.communicate() + if proc.returncode != 0: + print(f"FAILED on {node} (slices {sino_start}-{sino_end})") + print(f"STDERR: {stderr.decode()[-2000:]}") + failed.append(node) + else: + print(f"SUCCESS on {node} (slices {sino_start}-{sino_end})") + + # Cleanup + for hf in temp_hostfiles: + try: + os.remove(hf) + except OSError: + pass + + if failed: + raise RuntimeError(f"Reconstruction failed on nodes: {failed}") return f"Reconstructed {h5_file_name} across {num_nodes} nodes in {time.time() - rec_start:.1f}s" From c73b91aa2e8a975fe9b60539b0447677f7a41354 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Thu, 5 Feb 2026 14:51:51 -0800 Subject: [PATCH 32/35] docstring --- orchestration/flows/bl832/alcf.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index a11acfc4..71e79f47 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -131,6 +131,17 @@ def _reconstruct_wrapper_multinode( node_list: list[str] = None, # Pass explicitly num_nodes: int = 8, ) -> str: + """ + Wrapper function to run Tomopy reconstruction using mpiexec on ALCF across multiple nodes. 
+ + :param rundir: the directory on the eagle file system (ALCF) where the input data are located + :param script_path: the path to the script that will run the reconstruction + :param h5_file_name: the name of the h5 file to be reconstructed + :param folder_path: the path to the folder containing the h5 file + :param node_list: list of nodes to use for reconstruction (if None, will attempt to read from PBS_NODEFILE) + :param num_nodes: number of nodes to use for reconstruction (used if node_list is None) + :return: confirmation message + """ import os import subprocess import time From c58e3cd244bd89226cc7a423f85cbb9f36ccec36 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 10 Feb 2026 10:48:33 -0800 Subject: [PATCH 33/35] fixing typo --- orchestration/flows/bl832/prefect.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/flows/bl832/prefect.yaml b/orchestration/flows/bl832/prefect.yaml index 20858610..53ba7b69 100644 --- a/orchestration/flows/bl832/prefect.yaml +++ b/orchestration/flows/bl832/prefect.yaml @@ -55,7 +55,7 @@ deployments: name: alcf_recon_flow_pool work_queue_name: alcf_recon_flow_queue -- alcf_forge_recon_segment_flow: +- name: alcf_forge_recon_segment_flow entrypoint: orchestration/flows/bl832/alcf.py:alcf_forge_recon_segment_flow work_pool: name: alcf_recon_flow_pool From dab25f7125711f5550f5ce278d042ce893c4abae Mon Sep 17 00:00:00 2001 From: David Abramov Date: Tue, 10 Feb 2026 15:32:38 -0800 Subject: [PATCH 34/35] Segmentation with inference_v4 working --- orchestration/flows/bl832/alcf.py | 254 +++++++++++++++++++++++------- 1 file changed, 193 insertions(+), 61 deletions(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 71e79f47..22871152 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -335,8 +335,11 @@ def segmentation( _sync=True ) - segmentation_module = "src.inference_v2_optimized2" - workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" + # segmentation_module = "src.inference_v2_optimized2" + # workdir = f"{self.allocation_root}/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo" + + segmentation_module = "src.inference_v4" + workdir = f"{self.allocation_root}/segmentation/scripts/inference_v4/forge_feb_seg_model_demo" with Executor(endpoint_id=endpoint_id, client=gcc) as fxe: logger.info(f"Running segmentation on {recon_folder_path} at ALCF") @@ -351,14 +354,161 @@ def segmentation( return result + # @staticmethod + # def _segmentation_wrapper( + # input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", + # output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", + # script_module: str = "src.inference_v2_optimized2", + # workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", + # nproc_per_node: int = 4, + # patch_size: int = 640, + # batch_size: int = 8, + # confidence: float = 0.5, + # prompts: list[str] = ["Cortex", "Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], + # bpe_path: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/bpe_simple_vocab_16e6.txt.gz", + # finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint.pt", + # original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", + # use_finetuned: bool = True, + # ) -> str: + # """ + # Wrapper function to run segmentation 
using torch.distributed.run on ALCF. + # This is the code that is executed by Globus Compute. + + # :param input_dir: Directory containing input data for segmentation. + # :param output_dir: Directory to save segmentation outputs. + # :param script_module: Python module to run for segmentation. + # :param workdir: Working directory for the segmentation script. + # :param nproc_per_node: Number of processes per node. + # :param patch_size: Size of the patches for segmentation. + # :param batch_size: Batch size for segmentation. + # :param confidence: Confidence threshold for segmentation. + # :param prompts: List of prompts for segmentation. + # :param bpe_path: Path to the BPE vocabulary file. + # :param finetuned_checkpoint: Path to the finetuned model checkpoint. + # :param original_checkpoint: Path to the original model checkpoint. + # :param use_finetuned: Whether to use the finetuned model checkpoint. + + # :return: Confirmation message upon completion. + # """ + # import os + # import subprocess + # import time + + # seg_start = time.time() + # os.chdir(workdir) + + # # Get PBS info + # pbs_nodefile = os.environ.get("PBS_NODEFILE") + # pbs_jobid = os.environ.get("PBS_JOBID", "12345") + + # print("=== PBS DEBUG ===") + # print(f"PBS_NODEFILE: {pbs_nodefile}") + # print(f"PBS_JOBID: {pbs_jobid}") + + # # Determine number of nodes and master address based on PBS_NODEFILE + # if pbs_nodefile and os.path.exists(pbs_nodefile): + # with open(pbs_nodefile, 'r') as f: + # all_lines = [line.strip() for line in f if line.strip()] + # unique_nodes = list(dict.fromkeys(all_lines)) + # actual_nnodes = len(unique_nodes) + # master_addr = unique_nodes[0] + # print(f"PBS_NODEFILE contents: {all_lines}") + # print(f"Unique nodes ({actual_nnodes}): {unique_nodes}") + # print(f"Master: {master_addr}") + # else: + # actual_nnodes = 1 + # master_addr = "localhost" + # print("No PBS_NODEFILE, single node mode") + + # # Use explicit path to torchrun from the virtual environment + # venv_path = "/eagle/SYNAPS-I/segmentation/env" + + # # Build torchrun arguments + # # rdzv is used for rendezvous in multi-node setups, meaning all nodes can find each other + # torchrun_args = [ + # f"--nnodes={actual_nnodes}", + # f"--nproc_per_node={nproc_per_node}", + # f"--rdzv_id={pbs_jobid}", + # "--rdzv_backend=c10d", + # f"--rdzv_endpoint={master_addr}:29500", + # "-m", script_module, + # "--input-dir", input_dir, + # "--output-dir", output_dir, + # "--patch-size", str(patch_size), + # "--batch-size", str(batch_size), + # "--confidence", str(confidence), + # "--prompts", + # ] + # # Add prompts to the arguments, each prompt is a separate argument + # torchrun_args.extend([f'"{p}"' for p in prompts]) + + # torchrun_args.extend(["--bpe-path", bpe_path]) + + # if use_finetuned: + # torchrun_args.extend([ + # "--finetuned-checkpoint", finetuned_checkpoint, + # "--original-checkpoint", original_checkpoint, + # ]) + # else: + # torchrun_args.extend(["--original-checkpoint", original_checkpoint]) + + # torchrun_cmd = f"{venv_path}/bin/python -m torch.distributed.run " + " ".join(torchrun_args) + + # # Environment + NCCL setup - activate venv and set PATH explicitly + # # Following best practices from ALCF: + # # https://docs.alcf.anl.gov/polaris/data-science/frameworks/pytorch/#multi-gpu-multi-node-scale-up + # env_setup = ( + # f"source {venv_path}/bin/activate && " + # f"export PATH={venv_path}/bin:$PATH && " + # "export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface && " + # "export HF_HOME=$HF_HUB_CACHE && " + 
# "export CUDA_DEVICE_ORDER=PCI_BUS_ID && " + # "export NCCL_NET_GDR_LEVEL=PHB && " + # "export NCCL_CROSS_NIC=1 && " + # "export NCCL_COLLNET_ENABLE=1 && " + # 'export NCCL_NET="AWS Libfabric" && ' + # "export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH && " + # "export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH && " + # "export FI_CXI_DISABLE_HOST_REGISTER=1 && " + # "export FI_MR_CACHE_MONITOR=userfaultfd && " + # "export FI_CXI_DEFAULT_CQ_SIZE=131072 && " + # f"cd {workdir} && " + # ) + + # if actual_nnodes > 1: + # # Use mpiexec to launch torchrun on all nodes + # command = [ + # "mpiexec", + # "-n", str(actual_nnodes), + # "-ppn", "1", + # "-hostfile", pbs_nodefile, + # "--cpu-bind", "depth", + # "-d", "16", + # "bash", "-c", env_setup + torchrun_cmd + # ] + # else: + # command = ["bash", "-c", env_setup + torchrun_cmd] + + # print(f"Running: {' '.join(command)}") + + # result = subprocess.run(command, stdout=None, stderr=None, text=True) + # print(f"STDOUT: {result.stdout[-3000:] if result.stdout else 'None'}") + # print(f"STDERR: {result.stderr[-3000:] if result.stderr else 'None'}") + + # if result.returncode != 0: + # raise RuntimeError(f"Segmentation failed: {result.returncode}\nSTDERR: {result.stderr[-2000:]}") + + # return f"Completed in {time.time() - seg_start:.1f}s" + @staticmethod def _segmentation_wrapper( input_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/reconstruction/", output_dir: str = "/eagle/SYNAPS-I/data/bl832/scratch/segmentation/", - script_module: str = "src.inference_v2_optimized2", - workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/forge_feb_seg_model_demo_v2/forge_feb_seg_model_demo", + script_module: str = "src.inference_v4", + workdir: str = "/eagle/SYNAPS-I/segmentation/scripts/inference_v4/forge_feb_seg_model_demo", nproc_per_node: int = 4, patch_size: int = 640, + overlap_ratio: float = 0.25, batch_size: int = 8, confidence: float = 0.5, prompts: list[str] = ["Cortex", "Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], @@ -366,26 +516,10 @@ def _segmentation_wrapper( finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint.pt", original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", use_finetuned: bool = True, + skip_existing: bool = False, ) -> str: """ Wrapper function to run segmentation using torch.distributed.run on ALCF. - This is the code that is executed by Globus Compute. - - :param input_dir: Directory containing input data for segmentation. - :param output_dir: Directory to save segmentation outputs. - :param script_module: Python module to run for segmentation. - :param workdir: Working directory for the segmentation script. - :param nproc_per_node: Number of processes per node. - :param patch_size: Size of the patches for segmentation. - :param batch_size: Batch size for segmentation. - :param confidence: Confidence threshold for segmentation. - :param prompts: List of prompts for segmentation. - :param bpe_path: Path to the BPE vocabulary file. - :param finetuned_checkpoint: Path to the finetuned model checkpoint. - :param original_checkpoint: Path to the original model checkpoint. - :param use_finetuned: Whether to use the finetuned model checkpoint. - - :return: Confirmation message upon completion. 
""" import os import subprocess @@ -402,7 +536,6 @@ def _segmentation_wrapper( print(f"PBS_NODEFILE: {pbs_nodefile}") print(f"PBS_JOBID: {pbs_jobid}") - # Determine number of nodes and master address based on PBS_NODEFILE if pbs_nodefile and os.path.exists(pbs_nodefile): with open(pbs_nodefile, 'r') as f: all_lines = [line.strip() for line in f if line.strip()] @@ -417,12 +550,11 @@ def _segmentation_wrapper( master_addr = "localhost" print("No PBS_NODEFILE, single node mode") - # Use explicit path to torchrun from the virtual environment venv_path = "/eagle/SYNAPS-I/segmentation/env" - # Build torchrun arguments - # rdzv is used for rendezvous in multi-node setups, meaning all nodes can find each other - torchrun_args = [ + # Build command as a list (no shell escaping needed) + cmd_list = [ + f"{venv_path}/bin/python", "-m", "torch.distributed.run", f"--nnodes={actual_nnodes}", f"--nproc_per_node={nproc_per_node}", f"--rdzv_id={pbs_jobid}", @@ -432,48 +564,49 @@ def _segmentation_wrapper( "--input-dir", input_dir, "--output-dir", output_dir, "--patch-size", str(patch_size), + "--overlap-ratio", str(overlap_ratio), "--batch-size", str(batch_size), "--confidence", str(confidence), + "--bpe-path", bpe_path, "--prompts", ] - # Add prompts to the arguments, each prompt is a separate argument - torchrun_args.extend([f'"{p}"' for p in prompts]) - torchrun_args.extend(["--bpe-path", bpe_path]) + # Add prompts directly - no quotes needed with list-based subprocess + cmd_list.extend(prompts) if use_finetuned: - torchrun_args.extend([ + cmd_list.extend([ "--finetuned-checkpoint", finetuned_checkpoint, "--original-checkpoint", original_checkpoint, ]) else: - torchrun_args.extend(["--original-checkpoint", original_checkpoint]) - - torchrun_cmd = f"{venv_path}/bin/python -m torch.distributed.run " + " ".join(torchrun_args) - - # Environment + NCCL setup - activate venv and set PATH explicitly - # Following best practices from ALCF: - # https://docs.alcf.anl.gov/polaris/data-science/frameworks/pytorch/#multi-gpu-multi-node-scale-up - env_setup = ( - f"source {venv_path}/bin/activate && " - f"export PATH={venv_path}/bin:$PATH && " - "export HF_HUB_CACHE=/eagle/SYNAPS-I/segmentation/.cache/huggingface && " - "export HF_HOME=$HF_HUB_CACHE && " - "export CUDA_DEVICE_ORDER=PCI_BUS_ID && " - "export NCCL_NET_GDR_LEVEL=PHB && " - "export NCCL_CROSS_NIC=1 && " - "export NCCL_COLLNET_ENABLE=1 && " - 'export NCCL_NET="AWS Libfabric" && ' - "export LD_LIBRARY_PATH=/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:$LD_LIBRARY_PATH && " - "export LD_LIBRARY_PATH=/soft/libraries/hwloc/lib/:$LD_LIBRARY_PATH && " - "export FI_CXI_DISABLE_HOST_REGISTER=1 && " - "export FI_MR_CACHE_MONITOR=userfaultfd && " - "export FI_CXI_DEFAULT_CQ_SIZE=131072 && " - f"cd {workdir} && " - ) + cmd_list.extend(["--original-checkpoint", original_checkpoint]) + + if skip_existing: + cmd_list.append("--skip-existing") + + # Environment variables + env = os.environ.copy() + env.update({ + "PATH": f"{venv_path}/bin:{env.get('PATH', '')}", + "HF_HUB_CACHE": "/eagle/SYNAPS-I/segmentation/.cache/huggingface", + "HF_HOME": "/eagle/SYNAPS-I/segmentation/.cache/huggingface", + "CUDA_DEVICE_ORDER": "PCI_BUS_ID", + "NCCL_NET_GDR_LEVEL": "PHB", + "NCCL_CROSS_NIC": "1", + "NCCL_COLLNET_ENABLE": "1", + "NCCL_NET": "AWS Libfabric", + "FI_CXI_DISABLE_HOST_REGISTER": "1", + "FI_MR_CACHE_MONITOR": "userfaultfd", + "FI_CXI_DEFAULT_CQ_SIZE": "131072", + }) + + # Prepend to LD_LIBRARY_PATH + ld_path = env.get("LD_LIBRARY_PATH", "") + env["LD_LIBRARY_PATH"] = 
f"/soft/libraries/aws-ofi-nccl/v1.9.1-aws/lib:/soft/libraries/hwloc/lib/:{ld_path}" if actual_nnodes > 1: - # Use mpiexec to launch torchrun on all nodes + # Use mpiexec to launch on all nodes command = [ "mpiexec", "-n", str(actual_nnodes), @@ -481,19 +614,18 @@ def _segmentation_wrapper( "-hostfile", pbs_nodefile, "--cpu-bind", "depth", "-d", "16", - "bash", "-c", env_setup + torchrun_cmd - ] + ] + cmd_list else: - command = ["bash", "-c", env_setup + torchrun_cmd] + command = cmd_list print(f"Running: {' '.join(command)}") - result = subprocess.run(command, stdout=None, stderr=None, text=True) + result = subprocess.run(command, env=env, cwd=workdir, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) print(f"STDOUT: {result.stdout[-3000:] if result.stdout else 'None'}") print(f"STDERR: {result.stderr[-3000:] if result.stderr else 'None'}") if result.returncode != 0: - raise RuntimeError(f"Segmentation failed: {result.returncode}\nSTDERR: {result.stderr[-2000:]}") + raise RuntimeError(f"Segmentation failed: {result.returncode}\nSTDERR: {result.stderr[-2000:] if result.stderr else 'None'}") return f"Completed in {time.time() - seg_start:.1f}s" @@ -968,4 +1100,4 @@ def alcf_reconstruction_integration_test() -> bool: if __name__ == "__main__": - alcf_reconstruction_integration_test() + alcf_segmentation_integration_test() From 477b0deb7fdfe7213d2854aa8da5d7522bb51a99 Mon Sep 17 00:00:00 2001 From: David Abramov Date: Wed, 11 Feb 2026 11:50:03 -0800 Subject: [PATCH 35/35] Adding checkpoint v2 --- orchestration/flows/bl832/alcf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/orchestration/flows/bl832/alcf.py b/orchestration/flows/bl832/alcf.py index 22871152..b6553539 100644 --- a/orchestration/flows/bl832/alcf.py +++ b/orchestration/flows/bl832/alcf.py @@ -513,7 +513,7 @@ def _segmentation_wrapper( confidence: float = 0.5, prompts: list[str] = ["Cortex", "Phloem Fibers", "Air-based Pith cells", "Water-based Pith cells", "Xylem vessels"], bpe_path: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/bpe_simple_vocab_16e6.txt.gz", - finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint.pt", + finetuned_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/checkpoint_v2.pt", original_checkpoint: str = "/eagle/SYNAPS-I/segmentation/sam3_finetune/sam3/sam3.pt", use_finetuned: bool = True, skip_existing: bool = False,