Benchmark: Model benchmark - Add Multi-Node Support for LLaMA Benchmarks #759
base: main
Changes from all commits: 2dc1b08, 3ac61bf, 54b4de4, 334e9a4, 27406a8, ccdc293, 43df3f1, e7451b6, 5d4f6d9
```diff
@@ -65,3 +65,32 @@ Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron
 | megatron-gpt/${precision}_train_mem_allocated     | GB | The average GPU memory allocated per iteration.         |
 | megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
+
+## Multi-node LLaMA Benchmarks
+
+SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html) for multi-node LLaMA benchmarks based on PyTorch. Follow the steps below.
+
+1. Configure the Ansible inventory as described in [configuration](../../getting-started/configuration.md), using the private IPs of all nodes.
+
+2. Set the number of nodes (`node_num`), the number of GPUs per node (`proc_num`), and the required environment variables, including a resolvable `MASTER_ADDR` and an open TCP `MASTER_PORT`:
+
+   ```yaml title="llama.yaml"
+   default_pytorch_mode: &default_pytorch_mode
+     modes:
+       - name: torch.distributed
+         proc_num: 4    # GPUs per node
+         node_num: 10   # Total nodes
+         env:
+           NCCL_DEBUG: WARN
+           TORCH_NCCL_ASYNC_ERROR_HANDLING: '0'
+           NCCL_SOCKET_IFNAME: 'eth0'
+           NCCL_IB_DISABLE: '1'
+           NCCL_IGNORE_DISABLED_P2P: '0'
+           MASTER_ADDR: '10.0.0.6'   # Example of rank 0 node IP
+           MASTER_PORT: '29603'      # Example of TCP port
+   ```
+
+#### Prerequisites
+
+- Passwordless SSH configured across all nodes.
+- NVIDIA IMEX service running (verify with `nvidia-imex-ctl --H`).
+- The chosen `MASTER_PORT` open and reachable between all nodes.
```

**Contributor** (on `MASTER_ADDR` / `MASTER_PORT`): Can we make these two parameters
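The configuration above determines the run's global world size: torchrun starts `proc_num` workers on each of `node_num` nodes. As a minimal sketch (not part of SuperBench — `validate_mode` and its field names mirror the YAML shown above but are hypothetical helpers), the settings can be sanity-checked before launch:

```python
# Sketch: validate the multi-node mode settings from llama.yaml before
# launching. proc_num/node_num/env mirror the YAML fields shown above;
# torchrun derives the global WORLD_SIZE as proc_num * node_num.
import ipaddress


def validate_mode(mode: dict) -> int:
    """Return the expected WORLD_SIZE, raising on obviously bad settings."""
    proc_num = int(mode['proc_num'])   # GPUs per node
    node_num = int(mode['node_num'])   # total nodes
    if proc_num < 1 or node_num < 1:
        raise ValueError('proc_num and node_num must be positive')
    env = mode.get('env', {})
    # MASTER_ADDR must at least parse as an IP address
    # (a hostname would need a DNS lookup instead).
    ipaddress.ip_address(env['MASTER_ADDR'])
    port = int(env['MASTER_PORT'])
    if not 1024 <= port <= 65535:
        raise ValueError('MASTER_PORT should be an unprivileged TCP port')
    return proc_num * node_num


mode = {
    'name': 'torch.distributed',
    'proc_num': 4,
    'node_num': 10,
    'env': {'MASTER_ADDR': '10.0.0.6', 'MASTER_PORT': '29603'},
}
print(validate_mode(mode))  # 40 ranks: 4 GPUs/node * 10 nodes
```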
```diff
@@ -109,23 +109,44 @@ def _init_distributed_setting(self):
             )
             return False
         # torch >= 1.9.0a0 torch.distributed.elastic is used by default
-        port = int(os.environ.get('MASTER_PORT', '29500')) + 1
-        os.environ['MASTER_PORT'] = str(port)
-        addr = os.environ['MASTER_ADDR']
         self._global_rank = int(os.environ['RANK'])
         self._local_rank = int(os.environ['LOCAL_RANK'])
         self._world_size = int(os.environ['WORLD_SIZE'])
-        logger.debug('ip:{},port:{},rank:{},world:{}'.format(addr, port, self._global_rank, self._world_size))
-        store = PrefixStore(
-            self._name, TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
-        )
-        torch.distributed.init_process_group(
-            backend=self._args.distributed_backend.value,
-            timeout=timedelta(seconds=300),
-            rank=self._global_rank,
-            world_size=self._world_size,
-            store=store
-        )
+        self._local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
+        self._multi_node = True if self._world_size != self._local_world_size else False
+
+        if self._multi_node:
+            logger.debug(
+                'rank:{},world:{}, local_world:{}'.format(
+                    self._global_rank, self._world_size, self._local_world_size
+                )
+            )
+            torch.distributed.init_process_group(
+                backend=self._args.distributed_backend.value,
+                timeout=timedelta(seconds=300),
+                rank=self._global_rank,
+                world_size=self._world_size,
+            )
+        else:
+            port = int(os.environ.get('MASTER_PORT', '29500')) + 1
+            os.environ['MASTER_PORT'] = str(port)
+            addr = os.environ['MASTER_ADDR']
+            logger.debug(
+                'ip:{},port:{},rank:{},world:{}, local_world:{}'.format(
+                    addr, port, self._global_rank, self._world_size, self._local_world_size
+                )
+            )
+            store = PrefixStore(
+                self._name,
+                TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
+            )
+            torch.distributed.init_process_group(
+                backend=self._args.distributed_backend.value,
+                timeout=timedelta(seconds=300),
+                rank=self._global_rank,
+                world_size=self._world_size,
+                store=store
+            )
     else:
         logger.error(
```

**Contributor** (on `if self._multi_node:`): Do we really need to distinguish between multi_node and single_node here? Previously the current implementation should work for multiple nodes, and it works for both multi-node distributed benchmarking and single-node (multiple cards) parallel benchmarking. Have you tried whether the current implementation works?
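The detection in the diff above keys off two variables that torchrun exports to every worker: `WORLD_SIZE` (total ranks) and `LOCAL_WORLD_SIZE` (ranks on this node). A run spans multiple nodes exactly when the two differ. A standalone sketch of that logic (the environment dicts are hypothetical examples, not values from the PR):

```python
# Sketch of the multi-node detection used in the diff above: torchrun
# sets WORLD_SIZE and LOCAL_WORLD_SIZE for every worker, so a run is
# multi-node exactly when the global world size exceeds the local one.
import os


def is_multi_node(environ=os.environ) -> bool:
    world_size = int(environ['WORLD_SIZE'])
    local_world_size = int(environ['LOCAL_WORLD_SIZE'])
    return world_size != local_world_size


# Simulated torchrun environments (hypothetical values):
single = {'WORLD_SIZE': '4', 'LOCAL_WORLD_SIZE': '4'}    # one node, 4 GPUs
multi = {'WORLD_SIZE': '40', 'LOCAL_WORLD_SIZE': '4'}    # 10 nodes, 4 GPUs each
print(is_multi_node(single), is_multi_node(multi))  # False True
```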
```diff
@@ -152,11 +152,10 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             mode_command = ' '.join(command_parts)
             mode_command = f'PROC_RANK={mode.proc_rank} {mode_command}'
         elif mode.name == 'torch.distributed':
-            # TODO: replace with torch.distributed.run in v1.9
-            # TODO: only supports node_num=1 and node_num=all currently
             torch_dist_params = (
-                '' if 'node_num' in mode and mode.node_num == 1 else
-                '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
+                f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
+                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d ' if
+                ('node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1) else ''
             )

             nsys_prefix = (
```

**Contributor** (on lines +156 to +157): What happens if we use
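The diff above inverts the old condition: instead of emitting legacy `--master_addr`/`--master_port` flags for every multi-node run, it emits c10d rendezvous flags only when `node_num` is a digit greater than 1, and an empty string otherwise. A standalone sketch of that construction (using a plain dict and `dict.get` where the PR accesses an OmegaConf-style `mode` object, so the helper name and dict shape are illustrative only):

```python
# Sketch of the torchrun flag construction in __get_mode_command above:
# multi-node runs get c10d rendezvous flags, single-node runs get none.
# random.randint mirrors the diff's per-run rendezvous id.
import random


def torch_dist_params(mode: dict) -> str:
    node_num = mode.get('node_num')
    if node_num is not None and str(node_num).isdigit() and int(node_num) > 1:
        return (
            f'--nnodes={node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
            f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
        )
    return ''


print(repr(torch_dist_params({'node_num': 1})))  # '' — single node keeps the plain launch
print(torch_dist_params({'node_num': 10}).startswith('--nnodes=10'))  # True
```

Note the shell variables `$MASTER_ADDR` and `$MASTER_PORT` are left for the remote shell to expand, while `--nnodes` and `--rdzv-id` are baked into the command string at build time.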
**Contributor:** Why are you saying it is "for multi-node LLaMA benchmarks"? It looks like the change should target all models with the mode `torch.distributed`.