diff --git a/docs/user-tutorial/benchmarks/model-benchmarks.md b/docs/user-tutorial/benchmarks/model-benchmarks.md
index ba89ed6ff..3e277d194 100644
--- a/docs/user-tutorial/benchmarks/model-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/model-benchmarks.md
@@ -65,3 +65,32 @@ Run GPT pretrain tasks with float32, float16, bfloat16 precisions with [Megatron
 | megatron-gpt/${precision}_train_mem_allocated | GB | The average GPU memory allocated per iteration. |
 | megatron-gpt/${precision}_train_max_mem_allocated | GB | The average maximum GPU memory allocated per iteration. |
 
+## Multi-node LLaMA Benchmarks
+
+SuperBench uses [torchrun](https://docs.pytorch.org/docs/stable/elastic/run.html) to launch the PyTorch-based LLaMA benchmarks across multiple nodes. Follow the steps below.
+
+1. Configure the Ansible inventory as described in [configuration](../../getting-started/configuration.md), using the private IPs of all nodes.
+
+2. Set the number of nodes (`node_num`), the number of GPUs per node (`proc_num`), and the required environment variables, including a `MASTER_ADDR` that resolves on every node and an open TCP `MASTER_PORT`.
+
+```yaml title="llama.yaml"
+default_pytorch_mode: &default_pytorch_mode
+  modes:
+    - name: torch.distributed
+      proc_num: 4    # GPUs per node
+      node_num: 10   # total nodes
+      env:
+        NCCL_DEBUG: WARN
+        TORCH_NCCL_ASYNC_ERROR_HANDLING: '0'
+        NCCL_SOCKET_IFNAME: 'eth0'
+        NCCL_IB_DISABLE: '1'
+        NCCL_IGNORE_DISABLED_P2P: '0'
+        MASTER_ADDR: '10.0.0.6'    # example: rank 0 node IP
+        MASTER_PORT: '29603'       # example: open TCP port
+```
+
+### Prerequisites
+
+- Passwordless SSH configured across all nodes.
+- NVIDIA IMEX service running (verify with `nvidia-imex-ctl --H`).
+- The chosen `MASTER_PORT` open and reachable from all nodes.
diff --git a/superbench/benchmarks/model_benchmarks/pytorch_base.py b/superbench/benchmarks/model_benchmarks/pytorch_base.py
index 1d7950cad..c403599e8 100644
--- a/superbench/benchmarks/model_benchmarks/pytorch_base.py
+++ b/superbench/benchmarks/model_benchmarks/pytorch_base.py
@@ -109,23 +109,46 @@ def _init_distributed_setting(self):
             )
             return False
         # torch >= 1.9.0a0 torch.distributed.elastic is used by default
-        port = int(os.environ.get('MASTER_PORT', '29500')) + 1
-        os.environ['MASTER_PORT'] = str(port)
-        addr = os.environ['MASTER_ADDR']
         self._global_rank = int(os.environ['RANK'])
         self._local_rank = int(os.environ['LOCAL_RANK'])
         self._world_size = int(os.environ['WORLD_SIZE'])
-        logger.debug('ip:{},port:{},rank:{},world:{}'.format(addr, port, self._global_rank, self._world_size))
-        store = PrefixStore(
-            self._name, TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
-        )
-        torch.distributed.init_process_group(
-            backend=self._args.distributed_backend.value,
-            timeout=timedelta(seconds=300),
-            rank=self._global_rank,
-            world_size=self._world_size,
-            store=store
-        )
+        self._local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
+        self._multi_node = self._world_size != self._local_world_size
+
+        if self._multi_node:
+            # Ranks span multiple nodes: let the torchrun rendezvous drive init.
+            logger.debug(
+                'rank:{},world:{},local_world:{}'.format(
+                    self._global_rank, self._world_size, self._local_world_size
+                )
+            )
+            torch.distributed.init_process_group(
+                backend=self._args.distributed_backend.value,
+                timeout=timedelta(seconds=300),
+                rank=self._global_rank,
+                world_size=self._world_size,
+            )
+        else:
+            # Single node: keep the original TCPStore-based initialization.
+            port = int(os.environ.get('MASTER_PORT', '29500')) + 1
+            os.environ['MASTER_PORT'] = str(port)
+            addr = os.environ['MASTER_ADDR']
+            logger.debug(
+                'ip:{},port:{},rank:{},world:{},local_world:{}'.format(
+                    addr, port, self._global_rank, self._world_size, self._local_world_size
+                )
+            )
+            store = PrefixStore(
+                self._name,
+                TCPStore(addr, port, self._world_size, self._global_rank == 0, timedelta(seconds=300))
+            )
+            torch.distributed.init_process_group(
+                backend=self._args.distributed_backend.value,
+                timeout=timedelta(seconds=300),
+                rank=self._global_rank,
+                world_size=self._world_size,
+                store=store
+            )
         else:
             logger.error(
diff --git a/superbench/runner/runner.py b/superbench/runner/runner.py
index 5787274c7..de8a8c461 100644
--- a/superbench/runner/runner.py
+++ b/superbench/runner/runner.py
@@ -152,11 +152,10 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
             mode_command = ' '.join(command_parts)
             mode_command = f'PROC_RANK={mode.proc_rank} {mode_command}'
         elif mode.name == 'torch.distributed':
-            # TODO: replace with torch.distributed.run in v1.9
-            # TODO: only supports node_num=1 and node_num=all currently
             torch_dist_params = (
-                '' if 'node_num' in mode and mode.node_num == 1 else
-                '--nnodes=$NNODES --node_rank=$NODE_RANK --master_addr=$MASTER_ADDR --master_port=$MASTER_PORT '
+                f'--nnodes={mode.node_num} --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT '
+                f'--rdzv-id={random.randint(100, 999)} --rdzv-backend=c10d '
+                if 'node_num' in mode and str(mode.node_num).isdigit() and int(mode.node_num) > 1 else ''
             )
             nsys_prefix = (
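
For reviewers who want to exercise the new rendezvous path end to end, a minimal standalone sketch is shown below. It is illustrative only and not part of this patch: the script name `check_rdzv.py`, the hard-coded `nccl` backend, and the all-reduce payload are assumptions, while the `torchrun` flags in the comment mirror the ones `__get_mode_command` now emits and the environment variables are the same ones `_init_distributed_setting` reads.

```python
# check_rdzv.py -- minimal sketch for manually verifying the multi-node
# rendezvous path (assumed helper; not part of this patch).
#
# Launch on every node with flags matching the generated command, e.g.:
#   torchrun --nproc_per_node=4 --nnodes=10 \
#       --rdzv-endpoint=$MASTER_ADDR:$MASTER_PORT \
#       --rdzv-id=123 --rdzv-backend=c10d check_rdzv.py
import os
from datetime import timedelta

import torch
import torch.distributed as dist


def main():
    # torchrun exports these for every worker; _init_distributed_setting reads
    # the same variables and flags multi-node when WORLD_SIZE != LOCAL_WORLD_SIZE.
    rank = int(os.environ['RANK'])
    local_rank = int(os.environ['LOCAL_RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    local_world_size = int(os.environ['LOCAL_WORLD_SIZE'])
    multi_node = world_size != local_world_size

    # Same shape as the new multi-node branch: no explicit store, so the
    # process group is formed through the torchrun rendezvous endpoint.
    dist.init_process_group(backend='nccl', timeout=timedelta(seconds=300))
    torch.cuda.set_device(local_rank)

    # A single all-reduce proves every rank joined the same group:
    # the sum should equal WORLD_SIZE on all ranks.
    ones = torch.ones(1, device='cuda')
    dist.all_reduce(ones)
    if rank == 0:
        print(
            'world={} local_world={} multi_node={} allreduce_sum={}'.format(
                world_size, local_world_size, multi_node, int(ones.item())
            )
        )
    dist.destroy_process_group()


if __name__ == '__main__':
    main()
```

If the all-reduce hangs, the usual suspects are an unreachable `MASTER_PORT` or `NCCL_SOCKET_IFNAME` pointing at the wrong interface, which is why both appear in the documentation prerequisites above.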