From 94ba75d7f470a412a5d4f3cea3728792f49b38f7 Mon Sep 17 00:00:00 2001
From: Vadim Markovtsev
Date: Tue, 3 Feb 2026 02:00:07 +0100
Subject: [PATCH 1/2] Support building with headers from nvidia wheels (#2623)

* Support building with headers from nvidia wheels

There are two changes:
1. `import nvidia` returns a namespace package with `__file__` equal to
   `None`
2. Add a way to force headers from nvidia wheels. Without that envvar,
   it is practically impossible to use the wheel headers when CUDA is
   installed system-wide.

I successfully built the package with torch using the following `uv`
configuration:

```
[tool.uv.extra-build-dependencies]
"transformer-engine-torch" = [
    "ninja",
    "nvidia-cuda-crt==13.0.88",
    "nvidia-cuda-cccl==13.0.85",
    { requirement = "torch", match-runtime = true },
    { requirement = "pytorch-triton", match-runtime = true },
    { requirement = "nvidia-cusolver", match-runtime = true },
    { requirement = "nvidia-curand", match-runtime = true },
    { requirement = "nvidia-cublas", match-runtime = true },
    { requirement = "nvidia-cusparse", match-runtime = true },
    { requirement = "nvidia-cudnn-cu13", match-runtime = true },
    { requirement = "nvidia-nvtx", match-runtime = true },
    { requirement = "nvidia-cuda-nvrtc", match-runtime = true },
    { requirement = "nvidia-cuda-runtime", match-runtime = true },
]
```

Signed-off-by: Vadim Markovtsev

* Apply suggestion from @ksivaman

Signed-off-by: Kirthi Shankar Sivamani

Signed-off-by: Kirthi Shankar Sivamani

---------

Signed-off-by: Vadim Markovtsev
Signed-off-by: Kirthi Shankar Sivamani
Co-authored-by: Kirthi Shankar Sivamani
---
 build_tools/utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/build_tools/utils.py b/build_tools/utils.py
index 8a52440310..885901068a 100644
--- a/build_tools/utils.py
+++ b/build_tools/utils.py
@@ -228,9 +228,10 @@ def nvcc_path() -> Tuple[str, str]:
 
 def get_cuda_include_dirs() -> Tuple[str, str]:
     """Returns the CUDA header directory."""
+    force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0")))
     # If cuda is installed via toolkit, all necessary headers
     # are bundled inside the top level cuda directory.
-    if cuda_toolkit_include_path() is not None:
+    if not force_wheels and cuda_toolkit_include_path() is not None:
         return [cuda_toolkit_include_path()]
 
     # Use pip wheels to include all headers.
@@ -239,7 +240,10 @@
     except ModuleNotFoundError as e:
         raise RuntimeError("CUDA not found.")
 
-    cuda_root = Path(nvidia.__file__).parent
+    if nvidia.__file__ is not None:
+        cuda_root = Path(nvidia.__file__).parent
+    else:
+        cuda_root = Path(nvidia.__path__[0])  # namespace
     return [
         subdir / "include"
         for subdir in cuda_root.iterdir()

From 29b84c168ebbc151990e06d7a147532273837376 Mon Sep 17 00:00:00 2001
From: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
Date: Tue, 3 Feb 2026 02:49:34 +0100
Subject: [PATCH 2/2] [Common] Fix NVFP4 tuned-kernel numerics (#2639)

* Fixed scaling-factor computation for FP32 to match the reference
  implementation.

Signed-off-by: Oleg Goncharov

* Uncommented the tuned kernel path

Signed-off-by: Oleg Goncharov

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Oleg Goncharov
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../cast/nvfp4/quantize_transpose_nvfp4.cuh   |  8 ++++----
 .../quantize_transpose_nvfp4_tuned_1D.cuh     | 21 ++++++++++++++++---
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
index 61c6ba9cef..99776db281 100644
--- a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
+++ b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
@@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
   // TODO(Frank): Is there a better way to do this?
   bool return_transpose = output->has_columnwise_data();
 
-  // if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
-  //   quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
-  //   return;
-  // }
+  if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
+    quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
+    return;
+  }
 
   constexpr bool COMPUTE_ACTIVATIONS = false;
   using ParamOP = Empty;
diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
index 4119001686..061a88fd6d 100644
--- a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
+++ b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
@@ -163,9 +163,24 @@ __device__ __forceinline__ float get_amax_of_pair(const IType2 pair) {
 template <typename SF_TYPE>
 __device__ __forceinline__ SF_TYPE
 compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) {
-  constexpr float float_max = detail::TypeExtrema<float>::max;
-  const float scale_rcp = fminf(S_enc / static_cast<float>(S_dec_block), float_max);
-  return static_cast<SF_TYPE>(scale_rcp);
+  NVTE_DEVICE_ERROR("Unsupported scaling-factor type. Only FP32 and BF16 are supported.");
+}
+
+template <>
+__device__ __forceinline__ float compute_nvfp4_scaling_coefficient<float>(
+    const nvfp4_scale_t S_dec_block, const float S_enc) {
+  const float S_dec = 1.0f / S_enc;
+  const float scale_rcp =
+      fminf(1.0f / (static_cast<float>(S_dec_block) * S_dec), detail::TypeExtrema<float>::max);
+  return scale_rcp;
+}
+
+template <>
+__device__ __forceinline__ bf16
+compute_nvfp4_scaling_coefficient<bf16>(const nvfp4_scale_t S_dec_block, const float S_enc) {
+  const float scale_rcp =
+      fminf(S_enc / (static_cast<float>(S_dec_block)), detail::TypeExtrema<bf16>::max);
+  return static_cast<bf16>(scale_rcp);
 }
 
 template