diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index b423e3265..562d9c38c 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -10,6 +10,7 @@ on:
       - 'src/**'
       - 'include/**'
       - 'CMakeLists.txt'
+      - 'apps/nccl/**'
       - '.github/workflows/codeql-analysis.yml'
   pull_request:
     branches:
@@ -20,6 +21,7 @@ on:
       - 'src/**'
       - 'include/**'
       - 'CMakeLists.txt'
+      - 'apps/nccl/**'
       - '.github/workflows/codeql-analysis.yml'
   schedule:
     - cron: "30 1 * * 1"
diff --git a/apps/nccl/src/allreduce.cu b/apps/nccl/src/allreduce.cu
index 257db2ccc..e1d160995 100644
--- a/apps/nccl/src/allreduce.cu
+++ b/apps/nccl/src/allreduce.cu
@@ -79,7 +79,11 @@ struct NvlsAdapter {
 #endif
   {
     using ChannelType = mscclpp::DeviceHandle<mscclpp::MemoryChannel>;
-    int nBlocks = nRanksPerNode;
+    cudaDeviceProp prop;
+    MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
+    // On GB200, the optimal number of blocks depends on the GPU issue rate +
+    // NVLink switch reduction capacity, which is 24 here
+    int nBlocks = (prop.major == 10) ? 24 : nRanksPerNode;
     int nThreadsPerBlock = 1024;
     allreduce9<T><<<nBlocks, nThreadsPerBlock, 0, stream>>>((ChannelType*)memoryChannels, nvlsChannels, nvlsOutChannels, channelInOffset, channelOutOffset,
                                                             rank, nRanksPerNode, size);
@@ -331,7 +335,11 @@ mscclpp::Algorithm AllreducePacket::build() {
 
 void AllreduceNvls::initialize(std::shared_ptr<mscclpp::Communicator> comm,
                                std::unordered_map<std::string, std::shared_ptr<void>>&) {
-  nSwitchChannels_ = 8;
+  cudaDeviceProp prop;
+  MSCCLPP_CUDATHROW(cudaGetDeviceProperties(&prop, 0));
+  // On GB200, the optimal number of blocks depends on the GPU issue rate +
+  // NVLink switch reduction capacity, which is 24 here
+  nSwitchChannels_ = (prop.major == 10) ? 24 : 8;
   this->conns_ = setupConnections(comm);
   // setup semaphores
   std::vector<std::shared_ptr<mscclpp::MemoryDevice2DeviceSemaphore>> memorySemaphores =
@@ -680,4 +688,4 @@ mscclpp::Algorithm AllreduceNvlsPacket::build() {
         return self->generateAllreduceContextKey(input, output, count, dtype);
       });
   return allreduceAlgo;
-}
\ No newline at end of file
+}
diff --git a/apps/nccl/src/allreduce.hpp b/apps/nccl/src/allreduce.hpp
index 82adc323b..b9db69785 100644
--- a/apps/nccl/src/allreduce.hpp
+++ b/apps/nccl/src/allreduce.hpp
@@ -821,7 +821,11 @@ __global__ void __launch_bounds__(1024, 1)
   int nBlocks = gridDim.x;
   int bid = blockIdx.x;
   size_t sizePerRank = size / nRanksPerNode;
+#if __CUDA_ARCH__ >= 1000
+  size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16;
+#else
   size_t sizePerBlock = sizePerRank / nBlocks;
+#endif
   size_t rankOffset = sizePerRank * rank;
   size_t blockOffset = sizePerBlock * bid + rankOffset;
   mscclpp::DeviceHandle<mscclpp::SwitchChannel>* multicastPtr = multicast + bid;
@@ -842,6 +846,11 @@ __global__ void __launch_bounds__(1024, 1)
   __syncthreads();
   T* src = (T*)multicastPtr->mcPtr;
   T* dst = (T*)multicastOutPtr->mcPtr;
+#if __CUDA_ARCH__ >= 1000
+  if (bid == nBlocks - 1) {
+    sizePerBlock = sizePerRank - sizePerBlock * (nBlocks - 1);
+  }
+#endif
   handleMultiLoadReduceStore(src, dst, blockOffset + channelInOffset, blockOffset + channelOutOffset, sizePerBlock,
                              threadIdx.x, blockDim.x);
   __syncthreads();
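
Note on the allreduce.cu hunks: both make the same host-side decision. cudaGetDeviceProperties reports compute capability 10.x on Blackwell-class GPUs (GB200), which is what prop.major == 10 keys on; per the in-diff comment, 24 blocks (or switch channels) match the GPU issue rate plus the NVLink switch reduction capacity there. Below is a minimal standalone sketch of that dispatch; the helper name pickNumBlocks and its defaultBlocks fallback are illustrative, not from the PR.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative helper (not in the PR): query the device once and pick the
// tuned block count. prop.major == 10 means compute capability 10.x, i.e.
// Blackwell-class GPUs such as the B200s in a GB200 node.
static int pickNumBlocks(int defaultBlocks) {
  cudaDeviceProp prop;
  if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) return defaultBlocks;
  return (prop.major == 10) ? 24 : defaultBlocks;
}

int main() {
  std::printf("nBlocks = %d\n", pickNumBlocks(/*defaultBlocks=*/8));
  return 0;
}

The #if __CUDA_ARCH__ >= 1000 guards in allreduce.hpp are the compile-time counterpart of this runtime check: those branches are compiled into device code only for sm_100 and newer. Note also that the PR queries device 0 rather than the current device, which is fine on a homogeneous node.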
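
Note on the allreduce.hpp hunks: on sm_100+, each block's share of the per-rank range is rounded down to a multiple of 16 elements, and the last block absorbs the remainder, so the blocks together still cover sizePerRank exactly. A hypothetical host-side check of that arithmetic (the values are arbitrary examples, not from the PR):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Hypothetical standalone check of the sm_100 partitioning used by allreduce9:
// each block gets a 16-element-aligned chunk, and the last block also takes
// the remainder, so the union of all chunks covers sizePerRank exactly.
int main() {
  const size_t sizePerRank = 1234567;  // arbitrary example element count
  const int nBlocks = 24;              // the GB200 block count from this PR

  size_t sizePerBlock = (sizePerRank / nBlocks) / 16 * 16;        // aligned share
  size_t lastBlock = sizePerRank - sizePerBlock * (nBlocks - 1);  // remainder

  assert(sizePerBlock % 16 == 0);
  assert(sizePerBlock * (nBlocks - 1) + lastBlock == sizePerRank);
  std::printf("per-block: %zu elements, last block: %zu elements\n", sizePerBlock, lastBlock);
  return 0;
}

Without the last-block fix-up, the rounding would leave up to 16 * nBlocks - 1 elements at the tail of each rank's range unreduced; the 16-element alignment itself presumably matches the vectorized multimem access width used by handleMultiLoadReduceStore.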