From b4e95805e009b5ac9d58968e2dc9e1a2dab67b08 Mon Sep 17 00:00:00 2001 From: Shawn Dawson Date: Fri, 9 Jan 2026 14:52:31 -0800 Subject: [PATCH] Fix for sameNode functionality of ATS with flux. Call get_physical_node during init. Need to call this immediately to cache the hardware nodes before any jobs start running. The logic inspects the 'free' nodes reported by flux, which will change during the run, so inspect it immediately, before starting jobs, then save the list of hardware nodes. --- ats/atsMachines/fluxScheduled.py | 58 ++++++++++++++++++- ats/version.py | 2 +- pyproject.toml | 2 +- scripts/update-version.x | 4 +- test/HelloATS/READ.ME | 4 +- test/HelloCPUAffinity/READ.ME | 10 ++-- test/HelloGPU/READ.ME | 20 +++---- test/HelloGPU2/READ.ME | 10 ++-- test/HelloGPU2/READ.ME.CPX | 4 +- test/HelloOMP/READ.ME | 10 ++-- test/HelloSameNode/READ.ME | 2 +- test/HelloSameNode/test_get_node_names.py | 70 +++++++++++++++++++++++ 12 files changed, 160 insertions(+), 36 deletions(-) create mode 100755 test/HelloSameNode/test_get_node_names.py diff --git a/ats/atsMachines/fluxScheduled.py b/ats/atsMachines/fluxScheduled.py index 76f73be..a1492c9 100755 --- a/ats/atsMachines/fluxScheduled.py +++ b/ats/atsMachines/fluxScheduled.py @@ -13,6 +13,7 @@ import sys import time import subprocess +import re from math import ceil from ats import terminal @@ -22,13 +23,13 @@ from ats import configuration from ats import log - class FluxScheduled(lcMachines.LCMachineCore): """ A class to initialize Flux if necessary and return job statements from ATS tests. 
""" + _cached_nodes = None # static/class variable debug = False debug_canRunNow = False debug_noteLaunch = False @@ -103,6 +104,57 @@ def init(self): log(("DEBUG: FluxScheduled init : self.numNodesAvailable =%i" % (self.numNodesAvailable)), echo=True) log(("DEBUG: FluxScheduled init : self.numGPUsAvailable =%i" % (self.numGPUs)), echo=True) + # Call get_physical_node to cache the hardware node listing before starting jobs + self.get_physical_node(0) + + def expand_nodelist(self, nodelist_field): + """ + Expand a Flux nodelist string like 'rzadams[1002,1005-1007]' into a list of node names. + Handles multiple comma-separated patterns. + """ + nodes = [] + # Regex to find patterns like prefix[range] or prefixNNNN + pattern = re.compile(r'([a-zA-Z0-9_-]+)(?:\[(.*?)\])?') + for match in pattern.finditer(nodelist_field): + prefix = match.group(1) + bracket = match.group(2) + if bracket: + for part in bracket.split(','): + part = part.strip() + if '-' in part: + start, end = map(int, part.split('-')) + nodes.extend([f"{prefix}{i}" for i in range(start, end + 1)]) + else: + nodes.append(f"{prefix}{part}") + else: + nodes.append(prefix) + return nodes + + def get_physical_node(self, rel_index): + """ + Given a relative node number, return the actual physical node within the flux allocation. + Works for any node prefix (e.g., rzadams, elcap, tuo, syz). 
+ """ + if FluxScheduled._cached_nodes is None: + out = subprocess.check_output("flux resource list", shell=True).decode() + nodelist_field = None + for line in out.splitlines(): + if line.strip().startswith("free"): + parts = line.strip().split() + if len(parts) >= 5: + nodelist_field = parts[-1] + break + if nodelist_field is None: + raise RuntimeError("Could not find NODELIST field in flux resource list output.") + FluxScheduled._cached_nodes = self.expand_nodelist(nodelist_field) + log(("Info: Physical Hardware Nodes: %s" % FluxScheduled._cached_nodes), echo=True) + + nodes = FluxScheduled._cached_nodes + if rel_index < 0 or rel_index >= len(nodes): + raise IndexError(f"Relative index {rel_index} out of range (0-{len(nodes)-1})") + return nodes[rel_index] + + def kill(self, test): """ Final cleanup if any. Not implemented for Flux yet. @@ -288,7 +340,9 @@ def calculateCommandList(self, test): if same_node is not None: if same_node not in self.node_list: self.node_list.append(same_node) - ret.append(f"--requires=-rank:{self.node_list.index(same_node) % self.numNodes}") + rel_node = self.node_list.index(same_node) % self.numNodes + physical_node = self.get_physical_node(rel_node) + ret.append(f"--requires=host:{physical_node}") """ Need to set -n{np} and -c{test.cpus_per_task}. 
But we also need to account for accessing diff --git a/ats/version.py b/ats/version.py index 6a9814f..f446323 100644 --- a/ats/version.py +++ b/ats/version.py @@ -2,7 +2,7 @@ # ATS_MAJOR = 7 ATS_MINOR = 0 -ATS_PATCH = 121 +ATS_PATCH = 122 # # This version, constructed from the above, is used internally by ATS diff --git a/pyproject.toml b/pyproject.toml index 12e3516..963f13b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ license = "BSD-3-Clause" publish = false readme = "README.md" repository = "https://github.com/LLNL/ATS" -version = "7.0.121" +version = "7.0.122" [tool.poetry.dependencies] python = ">=3.8" diff --git a/scripts/update-version.x b/scripts/update-version.x index 915f0be..5c47e18 100755 --- a/scripts/update-version.x +++ b/scripts/update-version.x @@ -1,8 +1,8 @@ -/usr/gapps/ats/scripts/replace 7.0.120 7.0.121 \ +/usr/gapps/ats/scripts/replace 7.0.121 7.0.122 \ pyproject.toml \ test/*/READ.ME -/usr/gapps/ats/scripts/replace "ATS_PATCH = ..." "ATS_PATCH = 121" ats/version.py +/usr/gapps/ats/scripts/replace "ATS_PATCH = ..." "ATS_PATCH = 122" ats/version.py diff --git a/test/HelloATS/READ.ME b/test/HelloATS/READ.ME index 3d584e4..670a131 100644 --- a/test/HelloATS/READ.ME +++ b/test/HelloATS/READ.ME @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -54,7 +54,7 @@ Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- Toss 4 ATS-4 (rzvernal, rzadams, tioga, etc.) 
-------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH diff --git a/test/HelloCPUAffinity/READ.ME b/test/HelloCPUAffinity/READ.ME index c1da1bd..84a2087 100644 --- a/test/HelloCPUAffinity/READ.ME +++ b/test/HelloCPUAffinity/READ.ME @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -26,7 +26,7 @@ an a.out executable in this directory like so: -------------------------------------------------------------------------------- Toss 3 (rzgenie, etc.). Only use Slurm -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -42,7 +42,7 @@ Toss 3 (rzgenie, etc.). 
Only use Slurm -------------------------------------------------------------------------------- Toss 4 Cray rzvernal/rzadams ATS-4 -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -59,7 +59,7 @@ Toss 4 Cray rzvernal/rzadams ATS-4 -------------------------------------------------------------------------------- Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -80,7 +80,7 @@ Toss 4 Testing on slurm based toss4 machines such as rzwhippet -------------------------------------------------------------------------------- Blueos (rzansel) Uses LSF -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH diff --git a/test/HelloGPU/READ.ME b/test/HelloGPU/READ.ME index 69edf39..a871a02 100644 --- a/test/HelloGPU/READ.ME +++ b/test/HelloGPU/READ.ME @@ -9,10 +9,10 @@ Blueos (rzansel) Uses LSF mpixlc-gpu -fopenmp -DHAVE_OPENMP -DHAVE_OPENMP_4 hello_gpu.cc -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun -verbose -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun_exclusive -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun -verbose -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun 
--lrun_pack -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun_exclusive -verbose -------------------------------------------------------------------------------- @@ -35,12 +35,12 @@ Sample Runs of the code stand alone srun -n 4 ./a.out Sample ATS runs of the code - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive - /usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive --jsrun_np 4 --jsrun_ngpu 4 + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive + /usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun --jsrun_exclusive --jsrun_np 4 --jsrun_ngpu 4 Clean rm -rf a.out blueos_3* diff --git a/test/HelloGPU2/READ.ME b/test/HelloGPU2/READ.ME index 9c3417e..7f134bf 100644 --- a/test/HelloGPU2/READ.ME +++ b/test/HelloGPU2/READ.ME @@ -25,7 +25,7 @@ flux run -N1 -n1 -c 96 ./a.out 5 flux run -N 2 --tasks-per-node 2 ./a.out # ats tests -/usr/apps/ats/7.0.121/bin/atsflux --flux test.ats +/usr/apps/ats/7.0.122/bin/atsflux --flux test.ats -------------------------------------------------------------------------------- Test a GPU code built with cuda/nvcc @@ -52,10 +52,10 @@ lrun --mpibind=off -N2 -n16 ./a.out <- same as above # the cpu and gpu affinity in the tes case, so look at them by hand for reasonableness # lrun will 'pack' the jobs. 
jsrun will use a resource list -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose -/usr/apps/ats/7.0.121/bin/atslite1 --smpi_off --jsrun_exclusive -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --lrun --lrun_pack -verbose +/usr/apps/ats/7.0.122/bin/atslite1 --smpi_off --jsrun_exclusive -verbose -------------------------------------------------------------------------------- diff --git a/test/HelloGPU2/READ.ME.CPX b/test/HelloGPU2/READ.ME.CPX index 3b87dce..ffdc468 100644 --- a/test/HelloGPU2/READ.ME.CPX +++ b/test/HelloGPU2/READ.ME.CPX @@ -7,9 +7,9 @@ export ROCR_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20 make clean make hip -/usr/apps/ats/7.0.120/bin/atsflux --flux test.ats +/usr/apps/ats/7.0.122/bin/atsflux --flux test.ats -/usr/apps/ats/7.0.120/bin/atsflux --CPX --cpx --flux test.ats \ +/usr/apps/ats/7.0.122/bin/atsflux --CPX --cpx --flux test.ats \ --test_np_max=4 --gpus_per_task=1 --num_concurrent_jobs=48 diff --git a/test/HelloOMP/READ.ME b/test/HelloOMP/READ.ME index c5956f4..42c4f7d 100644 --- a/test/HelloOMP/READ.ME +++ b/test/HelloOMP/READ.ME @@ -6,7 +6,7 @@ Sample use of lightweight ATS wrapper script How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -48,7 +48,7 @@ Toss 3 (rzgenie, etc.). 
-------------------------------------------------------------------------------- export PATH=${PATH}:/usr/gapps/ats/scripts module load python/3.8.2 - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -65,7 +65,7 @@ Toss 3 (rzgenie, etc.). -------------------------------------------------------------------------------- Toss 4 ATS-4 (rzvernal, rzadams, tioga, etc.) -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH @@ -89,7 +89,7 @@ Toss 4 (rzwhippet, etc.). module load python/3.9.12 export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH or - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH mpicxx -g -qopenmp -pthread -O2 -o omp_test omp_test.cc @@ -108,7 +108,7 @@ Toss 4 (rzwhippet, etc.). 
-------------------------------------------------------------------------------- Blueos (rzansel) Uses LSF -------------------------------------------------------------------------------- - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH diff --git a/test/HelloSameNode/READ.ME b/test/HelloSameNode/READ.ME index edd25fa..68fbad1 100644 --- a/test/HelloSameNode/READ.ME +++ b/test/HelloSameNode/READ.ME @@ -4,7 +4,7 @@ How to use: - export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.121/bin:$PATH + export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.122/bin:$PATH or export PATH=/usr/gapps/ats/${SYS_TYPE}/7.0.${USER}/bin:$PATH diff --git a/test/HelloSameNode/test_get_node_names.py b/test/HelloSameNode/test_get_node_names.py new file mode 100755 index 0000000..c8b5f64 --- /dev/null +++ b/test/HelloSameNode/test_get_node_names.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +import subprocess +import sys +import re + +def get_node_names(): + """ + Parse the output of 'flux resource list' and return a list of node names in allocation order. + """ + try: + out = subprocess.check_output("flux resource list", shell=True).decode() + except Exception as e: + print(f"Error running 'flux resource list': {e}") + sys.exit(1) + + node_names = [] + # Look for NODELIST field in the output + for line in out.splitlines(): + if re.search(r'NODELIST', line): + # The next line should contain the node list + continue + m = re.search(r'(rzadams\[[0-9,-]+\]|rzadams[0-9]+)', line) + if m: + nodelist = m.group(1) + # Expand bracket notation, e.g. 
rzadams[1010-1013] + bracket = re.match(r'([a-zA-Z]+)\[([0-9,-]+)\]', nodelist) + if bracket: + base = bracket.group(1) + rng = bracket.group(2) + for part in rng.split(','): + if '-' in part: + start, end = map(int, part.split('-')) + node_names.extend([f"{base}{i}" for i in range(start, end+1)]) + else: + node_names.append(f"{base}{part}") + else: + node_names.append(nodelist) + return node_names + +def get_node_name(rel_index): + nodes = get_node_names() + if rel_index < 0 or rel_index >= len(nodes): + raise IndexError(f"Relative index {rel_index} out of range (0-{len(nodes)-1})") + return nodes[rel_index] + +def main(): + nodes = get_node_names() + print("Allocated nodes in order:") + for idx, n in enumerate(nodes): + print(f" [{idx}] {n}") + + if len(sys.argv) > 1: + try: + rel_index = int(sys.argv[1]) + except ValueError: + print("Usage: flux_node_mapper.py ") + sys.exit(1) + else: + rel_index = int(input(f"Enter relative node index (0-{len(nodes)-1}): ")) + + try: + node_name = get_node_name(rel_index) + print(f"Node {rel_index} maps to: {node_name}") + except Exception as e: + print(f"Error: {e}") + sys.exit(1) + +if __name__ == "__main__": + main()