From 2dc1b084838d0cad30c9050d66c3641e9c7b8b1d Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 30 Sep 2025 16:44:40 -0500
Subject: [PATCH 1/8] Add multi-node support for LLaMA benchmarks and update
 PyTorch distributed settings

---
 .../benchmarks/model-benchmarks.md | 30 ++++++++++++
 .../model_benchmarks/pytorch_base.py | 49 +++++++++++++------
 superbench/runner/runner.py | 9 ++--
 3 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index ba89ed6ff..f23e33458 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -65,3 +65,33 @@ Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron
 | megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. |
 | megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
 
+## Multi-node LLaMA Benchmarks
+
+SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html) to launch PyTorch-based multi-node LLaMA benchmarks. Follow the steps below.
+
+1. Configure the Ansible inventory as described in [configuration](../getting-started/configuration.md), using the private IPs of all nodes.
+
+2. Set the number of nodes (`node_num`), number of GPUs per node (`proc_num`), and required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`.
+
+```yaml title="llama.yaml"
+# SuperBench Config
+default_pytorch_mode: &default_pytorch_mode
+  modes:
+    - name: torch.distributed
+      proc_num: 4 # GPUs per node
+      node_num: 10 # Total nodes
+      env:
+        NCCL_DEBUG: WARN
+        TORCH_NCCL_ASYNC_ERROR_HANDLING: '0'
+        NCCL_SOCKET_IFNAME: 'eth0'
+        NCCL_IB_DISABLE: '1'
+        NCCL_IGNORE_DISABLED_P2P: '0'
+        MASTER_ADDR: '0.0.0.0' # Rank 0 node IP
+        MASTER_PORT: '29603' # TCP port
+```
+
+#### Prerequisites
+
+- Passwordless SSH configured across all nodes.
+- NVIDIA IMEX service running (verify with `nvidia-imex-ctl --H`).
+- The chosen `MASTER_PORT` open and reachable from all nodes.
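For reference, torchrun exports the rendezvous settings configured above to every worker process as environment variables. A minimal sketch that checks this contract before a run; the script name and printout are illustrative, not part of SuperBench:

```python
# check_dist_env.py - hypothetical helper, not part of SuperBench.
# torchrun (torch.distributed.run) exports these variables to every worker;
# the pytorch_base.py change below reads them to pick single- vs multi-node setup.
import os

REQUIRED = ('RANK', 'LOCAL_RANK', 'WORLD_SIZE', 'LOCAL_WORLD_SIZE', 'MASTER_ADDR', 'MASTER_PORT')

missing = [key for key in REQUIRED if key not in os.environ]
if missing:
    raise SystemExit(f'not launched via torchrun, missing: {missing}')

world_size = int(os.environ['WORLD_SIZE'])
local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
# WORLD_SIZE > LOCAL_WORLD_SIZE means the job spans more than one node.
print(f"rank={os.environ['RANK']} multi_node={world_size != local_world_size}")
```

Launched on each node with `torchrun --nproc_per_node=4 --nnodes=10 --rdzv-backend=c10d --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT check_dist_env.py`, every rank should report `multi_node=True` for the ten-node configuration above.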
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index 6bc3420ca..1f654f86c 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -109,23 +109,44 @@ def _init_distributed_setting(self):
                 )
                 return False
             # torch >= 1.9.0a0 torch.distributed.elastic is used by default
-            port = int(os.environ.get('MASTER_PORT', '29500')) + 1
-            os.environ['MASTER_PORT'] = str(port)
-            addr = os.environ['MASTER_ADDR']
             self._global_rank = int(os.environ['RANK'])
             self._local_rank = int(os.environ['LOCAL_RANK'])
             self._world_size = int(os.environ['WORLD_SIZE'])
-            logger.debug('ip:{},port:{},rank:{},world:{}'.format(addr, port, self._global_rank, self._world_size))
-            store = PrefixStore(
-                self._name, TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
-            )
-            torch.distributed.init_process_group(
-                backend=self._args.distributed_backend.value,
-                timeout=timedelta(seconds=300),
-                rank=self._global_rank,
-                world_size=self._world_size,
-                store=store
-            )
+            self._local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
+            self._multi_node = self._world_size != self._local_world_size
+
+            if self._multi_node:
+                logger.debug(
+                    'rank:{}, world:{}, local_world:{}'.format(
+                        self._global_rank, self._world_size, self._local_world_size
+                    )
+                )
+                torch.distributed.init_process_group(
+                    backend=self._args.distributed_backend.value,
+                    timeout=timedelta(seconds=300),
+                    rank=self._global_rank,
+                    world_size=self._world_size,
+                )
+            else:
+                port = int(os.environ.get('MASTER_PORT', '29500')) + 1
+                os.environ['MASTER_PORT'] = str(port)
+                addr = os.environ['MASTER_ADDR']
+                logger.debug(
+                    'ip:{}, port:{}, rank:{}, world:{}, local_world:{}'.format(
+                        addr, port, self._global_rank, self._world_size, self._local_world_size
+                    )
+                )
+                store = PrefixStore(
+                    self._name,
+                    TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
+                )
+                torch.distributed.init_process_group(
+                    backend=self._args.distributed_backend.value,
+                    timeout=timedelta(seconds=300),
+                    rank=self._global_rank,
+                    world_size=self._world_size,
+                    store=store
+                )
         else:
             logger.error(
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index adf9561fd..ae9a71748 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -139,10 +139,11 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             )
             mode_command = f'PROC_RANK={mode.proc_rank} {mode_command.strip()}'
         elif mode.name == 'torch.distributed':
-            # TODO: replace with torch.distributed.run in v1.9
-            # TODO: only supports node_num=1 and node_num=all currently
-            torch_dist_params = '' if 'node_num' in mode and mode.node_num == 1 else \
-                '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
+            torch_dist_params = (
+                f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
+                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
+                if 'node_num' in mode and mode.node_num > 1 else ''
+            )
             mode_command = (
                 f'torchrun'
                 f' --no_python --nproc_per_node={mode.proc_num} {torch_dist_params}{exec_command}'

From 3ac61bff79f4c444f65556bd29160d8ae0515ffa Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 30 Sep 2025 17:14:47 -0500
Subject: [PATCH 2/8] Add example port

---
 docs/user-tutorial/benchmarks/model-benchmarks.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
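A note on the `_init_distributed_setting` change in PATCH 1 above: the multi-node branch can drop the explicit `PrefixStore`/`TCPStore` because, when neither a `store` nor an `init_method` argument is passed, `torch.distributed.init_process_group` defaults to the `env://` rendezvous and reads `MASTER_ADDR`, `MASTER_PORT`, `RANK`, and `WORLD_SIZE` itself. A minimal standalone sketch of that fallback; `gloo` is used so it runs without GPUs, whereas the benchmark passes its configured backend:

```python
# env_rendezvous.py - minimal sketch of the env:// fallback; not SuperBench code.
import os
from datetime import timedelta

import torch.distributed as dist

def init_from_env():
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    # No store/init_method argument: defaults to init_method='env://', which
    # reads MASTER_ADDR and MASTER_PORT from the environment set by torchrun.
    dist.init_process_group(
        backend='gloo',
        timeout=timedelta(seconds=300),
        rank=rank,
        world_size=world_size,
    )
    return dist.get_rank(), dist.get_world_size()

if __name__ == '__main__':
    rank, world = init_from_env()
    print(f'initialized rank {rank} of {world}')
    dist.destroy_process_group()
```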
diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index f23e33458..e77f58dfe 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -74,7 +74,6 @@ SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html
 2. Set the number of nodes (`node_num`), number of GPUs per node (`proc_num`), and required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`.
 
 ```yaml title="llama.yaml"
-# SuperBench Config
 default_pytorch_mode: &default_pytorch_mode
   modes:
     - name: torch.distributed
@@ -86,8 +85,8 @@ default_pytorch_mode: &default_pytorch_mode
         NCCL_SOCKET_IFNAME: 'eth0'
         NCCL_IB_DISABLE: '1'
         NCCL_IGNORE_DISABLED_P2P: '0'
-        MASTER_ADDR: '0.0.0.0' # Rank 0 node IP
-        MASTER_PORT: '29603' # TCP port
+        MASTER_ADDR: '10.0.0.6' # Example of rank 0 node IP
+        MASTER_PORT: '29603' # Example of TCP port
 ```
 
 #### Prerequisites

From 334e9a4f69c80eee6164fab4b2a2dcbb913b9964 Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 18 Nov 2025 15:59:50 -0600
Subject: [PATCH 3/8] Empty lines

---
 superbench/runner/runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index b93141031..086faf18a 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -27,7 +27,6 @@ class SuperBenchRunner():
     """SuperBench runner class."""
-
     def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
         """Initilize.
@@ -158,10 +157,12 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
                 f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
                 if 'node_num' in mode and mode.node_num > 1 else ''
             )
+
             nsys_prefix = (
                 f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
                 f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
             ) if enable_nsys else ''
+
             mode_command = (
                 f'{nsys_prefix}'
                 f'torchrun'

From 27406a8e4a9f8cc1de2d3c30b025c3ca9ed6a60d Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Tue, 18 Nov 2025 16:23:24 -0600
Subject: [PATCH 4/8] Fix documentation link for Ansible inventory
 configuration in model benchmarks

---
 docs/user-tutorial/benchmarks/model-benchmarks.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index e77f58dfe..3e277d194 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -69,7 +69,7 @@
 SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html) to launch PyTorch-based multi-node LLaMA benchmarks. Follow the steps below.
 
-1. Configure the Ansible inventory as described in [configuration](../getting-started/configuration.md), using the private IPs of all nodes.
+1. Configure the Ansible inventory as described in [configuration](../../getting-started/configuration.md), using the private IPs of all nodes.
 
 2. Set the number of nodes (`node_num`), number of GPUs per node (`proc_num`), and required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`.
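As context for the `nsys_prefix` lines shown in PATCH 3 above: when profiling is enabled, the runner prepends a single Nsight Systems invocation to the whole torchrun line, so each node writes one trace report. An illustrative composition of the pieces; `trace_dir`, the benchmark name, and `enable_nsys` here are placeholder assumptions, not SuperBench API:

```python
# Illustrative only: mirrors how nsys_prefix composes with the torchrun line.
benchmark_name = 'pytorch-llama'        # hypothetical benchmark name
trace_dir = '/tmp/superbench/traces'    # hypothetical trace output directory
enable_nsys = True

nsys_prefix = (
    f'nsys profile --output {trace_dir}/{benchmark_name}_traces '
    f'--backtrace none --sample none --force-overwrite true --cpuctxsw none --trace cuda,nvtx '
) if enable_nsys else ''

exec_command = '<benchmark executable and args>'    # placeholder
# The profiler wraps the entire torchrun invocation on each node.
print(f'{nsys_prefix}torchrun --no_python --nproc_per_node=4 {exec_command}')
```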
From ccdc2935386c48cafa99988ded53b472754ade2c Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:22:11 -0600
Subject: [PATCH 5/8] Fix type conversion for node_num in __get_mode_command
 method

---
 superbench/runner/runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 086faf18a..52e5ce4da 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -27,6 +27,7 @@ class SuperBenchRunner():
     """SuperBench runner class."""
+
     def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
         """Initilize.
@@ -155,7 +156,7 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             torch_dist_params = (
                 f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
                 f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
-                if 'node_num' in mode and mode.node_num > 1 else ''
+                if 'node_num' in mode and int(mode.node_num) > 1 else ''
             )
 
             nsys_prefix = (

From 43df3f1022f3e4255023c3b89e424d7a788f0720 Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:30:22 -0600
Subject: [PATCH 6/8] Empty line

---
 superbench/runner/runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 52e5ce4da..e56f96514 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -27,7 +27,6 @@ class SuperBenchRunner():
     """SuperBench runner class."""
-
     def __init__(self, sb_config, docker_config, ansible_config, sb_output_dir):
         """Initilize.

From e7451b68473356aa433e90be2ae52b4be9bbf99c Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:53:21 -0600
Subject: [PATCH 7/8] Fix type checking for node_num in __get_mode_command
 method

---
 superbench/runner/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index e56f96514..4eea30e1b 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -155,7 +155,7 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             torch_dist_params = (
                 f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
                 f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
-                if 'node_num' in mode and int(mode.node_num) > 1 else ''
+                if ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
             )
 
             nsys_prefix = (

From 5d4f6d9aa8881cfa9f4727cb933905856a09df5b Mon Sep 17 00:00:00 2001
From: AVA <39534996+avazr@users.noreply.github.com>
Date: Wed, 19 Nov 2025 17:59:41 -0600
Subject: [PATCH 8/8] Lint fix

---
 superbench/runner/runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 4eea30e1b..de8a8c461 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -154,8 +154,8 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
         elif mode.name == 'torch.distributed':
             torch_dist_params = (
                 f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
-                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
-                if ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
+                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d ' if
+                ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
             )
 
             nsys_prefix = (
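Taken together, PATCHES 5-8 harden the multi-node guard so that `node_num` values which are not digit strings (for example the `all` mentioned in the TODO removed by PATCH 1) produce no rendezvous flags instead of raising inside `int()`. A small sketch of the final behavior; the function, the fixed rendezvous id, and the `sb-exec` command are illustrative stand-ins for the runner's internals:

```python
# Sketch of the node_num guard after PATCHES 5-8; not the runner's actual API.
def torchrun_command(proc_num, node_num, exec_command, rdzv_id=123):
    # str(...).isdigit() accepts int and digit-string configs alike and
    # rejects values such as 'all' without raising in int().
    torch_dist_params = (
        f'--nnodes={node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
        f'--rdzv-id={rdzv_id} --rdzv-backend=c10d '
        if str(node_num).isdigit() and int(node_num) > 1 else ''
    )
    return f'torchrun --no_python --nproc_per_node={proc_num} {torch_dist_params}{exec_command}'

print(torchrun_command(4, 10, 'sb-exec'))     # multi-node: rendezvous flags emitted
print(torchrun_command(8, 1, 'sb-exec'))      # single node: plain local launch
print(torchrun_command(8, 'all', 'sb-exec'))  # non-numeric: no rendezvous flags
```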