From 2dc1b084838d0cad30c9050d66c3641e9c7b8b1d Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 30 Sep 2025 16:44:40 -0500
Subject: [PATCH 1/8] Add multi-node support for LLaMA benchmarks and update
 PyTorch distributed settings

---
 .../benchmarks/model-benchmarks.md | 30 ++++++++++++
 .../model_benchmarks/pytorch_base.py | 49 +++++++++++++------
 superbench/runner/runner.py | 9 ++--
 3 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index ba89ed6ff..f23e33458 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -65,3 +65,33 @@ Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron
 | megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. |
 | megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
 
+## Multi-node LLaMA Benchmarks
+
+SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html) to launch PyTorch-based multi-node LLaMA benchmarks. Follow the steps below.
+
+1. Configure the Ansible inventory as described in [configuration](../getting-started/configuration.md), using the private IPs of all nodes.
+
+2. Set the number of nodes (`node_num`), number of GPUs per node (`proc_num`), and required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`.
+
+```yaml title="llama.yaml"
+# SuperBench Config
+default_pytorch_mode: &default_pytorch_mode
+  modes:
+    - name: torch.distributed
+      proc_num: 4 # GPUs per node
+      node_num: 10 # Total nodes
+      env:
+        NCCL_DEBUG: WARN
+        TORCH_NCCL_ASYNC_ERROR_HANDLING: '0'
+        NCCL_SOCKET_IFNAME: 'eth0'
+        NCCL_IB_DISABLE: '1'
+        NCCL_IGNORE_DISABLED_P2P: '0'
+        MASTER_ADDR: '0.0.0.0' # Rank 0 node IP
+        MASTER_PORT: '29603' # TCP port
+```
+
+#### Prerequisites
+
+- Passwordless SSH configured across all nodes.
+- NVIDIA IMEX service running (verify with `nvidia-imex-ctl --H`).
+- The chosen `MASTER_PORT` open and reachable from all nodes.
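For reference, torchrun exports the rendezvous settings configured above to every worker process as environment variables. A minimal sketch that checks this contract before a run; the script name and printout are illustrative, not part of SuperBench:

```python
# check_dist_env.py - hypothetical helper, not part of SuperBench.
# torchrun (torch.distributed.run) exports these variables to every worker;
# the pytorch_base.py change below reads them to pick single- vs multi-node setup.
import os

REQUIRED = ('RANK', 'LOCAL_RANK', 'WORLD_SIZE', 'LOCAL_WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT')

missing = [key for key in REQUIRED if key not in os.environ]
if missing:
    raise SystemExit(f'not launched via torchrun, missing: {missing}')

world_size = int(os.environ['WORLD_SIZE'])
local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
# WORLD_SIZE > LOCAL_WORLD_SIZE means the job spans more than one node.
print(f"rank={os.environ['RANK']} multi_node={world_size != local_world_size}")
```

Launched on each node with `torchrun --nproc_per_node=4 --nnodes=10 --rdzv-backend=c10d --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT check_dist_env.py`, every rank should report `multi_node=True` for the ten-node configuration above.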
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index 6bc3420ca..1f654f86c 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -109,23 +109,44 @@ def _init_distributed_setting(self):
                 )
                 return False
             # torch >= 1.9.0a0 torch.distributed.elastic is used by default
-            port = int(os.environ.get('MASTER_PORT', '29500')) + 1
-            os.environ['MASTER_PORT'] = str(port)
-            addr = os.environ['MASTER_ADDR']
             self._global_rank = int(os.environ['RANK'])
             self._local_rank = int(os.environ['LOCAL_RANK'])
             self._world_size = int(os.environ['WORLD_SIZE'])
-            logger.debug('ip:{},port:{},rank:{},world:{}'.format(addr, port, self._global_rank, self._world_size))
-            store = PrefixStore(
-                self._name, TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
-            )
-            torch.distributed.init_process_group(
-                backend=self._args.distributed_backend.value,
-                timeout=timedelta(seconds=300),
-                rank=self._global_rank,
-                world_size=self._world_size,
-                store=store
-            )
+            self._local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
+            self._multi_node = self._world_size != self._local_world_size
+
+            if self._multi_node:
+                logger.debug(
+                    'rank:{}, world:{}, local_world:{}'.format(
+                        self._global_rank, self._world_size, self._local_world_size
+                    )
+                )
+                torch.distributed.init_process_group(
+                    backend=self._args.distributed_backend.value,
+                    timeout=timedelta(seconds=300),
+                    rank=self._global_rank,
+                    world_size=self._world_size,
+                )
+            else:
+                port = int(os.environ.get('MASTER_PORT', '29500')) + 1
+                os.environ['MASTER_PORT'] = str(port)
+                addr = os.environ['MASTER_ADDR']
+                logger.debug(
+                    'ip:{}, port:{}, rank:{}, world:{}, local_world:{}'.format(
+                        addr, port, self._global_rank, self._world_size, self._local_world_size
+                    )
+                )
+                store = PrefixStore(
+                    self._name,
+                    TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
+                )
+                torch.distributed.init_process_group(
+                    backend=self._args.distributed_backend.value,
+                    timeout=timedelta(seconds=300),
+                    rank=self._global_rank,
+                    world_size=self._world_size,
+                    store=store
+                )
         else:
             logger.error(
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index adf9561fd..ae9a71748 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -139,10 +139,11 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             )
             mode_command = f'PROC_RANK={mode.proc_rank} {mode_command.strip()}'
         elif mode.name == 'torch.distributed':
-            # TODO: replace with torch.distributed.run in v1.9
-            # TODO: only supports node_num=1 and node_num=all currently
-            torch_dist_params = '' if 'node_num' in mode and mode.node_num == 1 else \
-                '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
+            torch_dist_params = (
+                f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
+                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
+                if 'node_num' in mode and mode.node_num > 1 else ''
+            )
             mode_command = (
                 f'torchrun'
                 f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'

From 3ac61bff79f4c444f65556bd29160d8ae0515ffa Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 30 Sep 2025 17:14:47 -0500
Subject: [PATCH 2/8] Add example port

---
 docs/user-tutorial/benchmarks/model-benchmarks.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
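A note on the `_init_distributed_setting` change in PATCH 1 above: the multi-node branch can drop the explicit `PrefixStore`/`TCPStore` because, when neither a `store` nor an `init_method` argument is passed, `torch.distributed.init_process_group` defaults to the `env://` rendezvous and reads `MASTER_ADDR`, `MASTER_PORT`, `RANK`, and `WORLD_SIZE` itself. A minimal standalone sketch of that fallback; `gloo` is used so it runs without GPUs, whereas the benchmark passes its configured backend:

```python
# env_rendezvous.py - minimal sketch of the env:// fallback; not SuperBench code.
import os
from datetime import timedelta

import torch.distributed as dist

def init_from_env():
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    # No store/init_method argument: defaults to init_method='env://', which
    # reads MASTER_ADDR and MASTER_PORT from the environment set by torchrun.
    dist.init_process_group(
        backend='gloo',
        timeout=timedelta(seconds=300),
        rank=rank,
        world_size=world_size,
    )
    return dist.get_rank(), dist.get_world_size()

if __name__ == '__main__':
    rank, world = init_from_env()
    print(f'initialized rank {rank} of {world}')
    dist.destroy_process_group()
```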
diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index f23e33458..e77f58dfe 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -74,7 +74,6 @@ SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html
 2. Set the number of nodes (`node_num`), number of GPUs per node (`proc_num`), and required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`.
 
 ```yaml title="llama.yaml"
-# SuperBench Config
 default_pytorch_mode: &default_pytorch_mode
   modes:
     - name: torch.distributed
@@ -86,8 +85,8 @@ default_pytorch_mode: &default_pytorch_mode
         NCCL_SOCKET_IFNAME: 'eth0'
         NCCL_IB_DISABLE: '1'
         NCCL_IGNORE_DISABLED_P2P: '0'
-        MASTER_ADDR: '0.0.0.0' # Rank 0 node IP
-        MASTER_PORT: '29603' # TCP port
+        MASTER_ADDR: '10.0.0.6' # Example of rank 0 node IP
+        MASTER_PORT: '29603' # Example of TCP port
 ```
 
 #### Prerequisites

From 334e9a4f69c80eee6164fab4b2a2dcbb913b9964 Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 18 Nov 2025 15:59:50 -0600
Subject: [PATCH 3/8] Empty lines

---
 superbench/runner/runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index b93141031..086faf18a 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -27,7 +27,6 @@ class SuperBenchRunner():
     """SuperBench runner class."""
-
     def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
         """Initilize.
@@ -158,10 +157,12 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
                 f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
                 if 'node_num' in mode and mode.node_num > 1 else ''
             )
+
             nsys_prefix = (
                 f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
                 f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
             ) if enable_nsys else ''
+
             mode_command = (
                 f'{nsys_prefix}'
                 f'torchrun'

From 27406a8e4a9f8cc1de2d3c30b025c3ca9ed6a60d Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 18 Nov 2025 16:23:24 -0600
Subject: [PATCH 4/8] Fix documentation link for Ansible inventory
 configuration in model benchmarks

---
 docs/user-tutorial/benchmarks/model-benchmarks.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index e77f58dfe..3e277d194 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -69,7 +69,7 @@
 SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html) to launch PyTorch-based multi-node LLaMA benchmarks. Follow the steps below.
 
-1. Configure the Ansible inventory as described in [configuration](../getting-started/configuration.md), using the private IPs of all nodes.
+1. Configure the Ansible inventory as described in [configuration](../../getting-started/configuration.md), using the private IPs of all nodes.
 
 2. Set the number of nodes (`node_num`), number of GPUs per node (`proc_num`), and required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`.
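As context for the `nsys_prefix` lines shown in PATCH 3 above: when profiling is enabled, the runner prepends a single Nsight Systems invocation to the whole torchrun line, so each node writes one trace report. An illustrative composition of the pieces; `trace_dir`, the benchmark name, and `enable_nsys` here are placeholder assumptions, not SuperBench API:

```python
# Illustrative only: mirrors how nsys_prefix composes with the torchrun line.
benchmark_name = 'pytorch-llama'        # hypothetical benchmark name
trace_dir = '/tmp/superbench/traces'    # hypothetical trace output directory
enable_nsys = True

nsys_prefix = (
    f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
    f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys else ''

exec_command = '<benchmark executable and args>'    # placeholder
# The profiler wraps the entire torchrun invocation on each node.
print(f'{nsys_prefix}torchrun --no_python --nproc_per_node=4 {exec_command}')
```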
From ccdc2935386c48cafa99988ded53b472754ade2c Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:22:11 -0600
Subject: [PATCH 5/8] Fix type conversion for node_num in __get_mode_command
 method

---
 superbench/runner/runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 086faf18a..52e5ce4da 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -27,6 +27,7 @@ class SuperBenchRunner():
     """SuperBench runner class."""
+
     def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
         """Initilize.
@@ -155,7 +156,7 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             torch_dist_params = (
                 f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
                 f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
-                if 'node_num' in mode and mode.node_num > 1 else ''
+                if 'node_num' in mode and int(mode.node_num) > 1 else ''
             )
 
             nsys_prefix = (

From 43df3f1022f3e4255023c3b89e424d7a788f0720 Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:30:22 -0600
Subject: [PATCH 6/8] Empty line

---
 superbench/runner/runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 52e5ce4da..e56f96514 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -27,7 +27,6 @@ class SuperBenchRunner():
     """SuperBench runner class."""
-
     def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
         """Initilize.

From e7451b68473356aa433e90be2ae52b4be9bbf99c Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:53:21 -0600
Subject: [PATCH 7/8] Fix type checking for node_num in __get_mode_command
 method

---
 superbench/runner/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index e56f96514..4eea30e1b 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -155,7 +155,7 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             torch_dist_params = (
                 f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
                 f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
-                if 'node_num' in mode and int(mode.node_num) > 1 else ''
+                if ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
             )
 
             nsys_prefix = (

From 5d4f6d9aa8881cfa9f4727cb933905856a09df5b Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:59:41 -0600
Subject: [PATCH 8/8] Lint fix

---
 superbench/runner/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 4eea30e1b..de8a8c461 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -154,8 +154,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
         elif mode.name == 'torch.distributed':
             torch_dist_params = (
                 f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
-                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
-                if ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
+                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d ' if
+                ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
             )
 
             nsys_prefix = (
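Taken together, PATCHES 5-8 harden the multi-node guard so that `node_num` values which are not digit strings (for example the `all` mentioned in the TODO removed by PATCH 1) produce no rendezvous flags instead of raising inside `int()`. A small sketch of the final behavior; the function, the fixed rendezvous id, and the `sb-exec` command are illustrative stand-ins for the runner's internals:

```python
# Sketch of the node_num guard after PATCHES 5-8; not the runner's actual API.
def torchrun_command(proc_num, node_num, exec_command, rdzv_id=123):
    # str(...).isdigit() accepts int and digit-string configs alike and
    # rejects values such as 'all' without raising in int().
    torch_dist_params = (
        f'--nnodes={node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
        f'--rdzv-id={rdzv_id} --rdzv-backend=c10d '
        if str(node_num).isdigit() and int(node_num) > 1 else ''
    )
    return f'torchrun --no_python --nproc_per_node={proc_num} {torch_dist_params}{exec_command}'

print(torchrun_command(4, 10, 'sb-exec'))     # multi-node: rendezvous flags emitted
print(torchrun_command(8, 1, 'sb-exec'))      # single node: plain local launch
print(torchrun_command(8, 'all', 'sb-exec'))  # non-numeric: no rendezvous flags
```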