From 94ba75d7f470a412a5d4f3cea3728792f49b38f7 Mon Sep 17 00:00:00 2001
From: Vadim Markovtsev
Date: Tue, 3 Feb 2026 02:00:07 +0100
Subject: [PATCH 1/2] Support building with headers from nvidia wheels (#2623)

* Support building with headers from nvidia wheels

There are two changes:
1. `import nvidia` returns a namespace package with `__file__` equal to
   `None`
2. Add a way to force headers from nvidia wheels. Without that envvar,
   it is practically impossible to use the wheel headers when CUDA is
   installed system-wide.

I successfully built the package with torch using the following `uv`
configuration:

```
[tool.uv.extra-build-dependencies]
"transformer-engine-torch" = [
    "ninja",
    "nvidia-cuda-crt==13.0.88",
    "nvidia-cuda-cccl==13.0.85",
    { requirement = "torch", match-runtime = true },
    { requirement = "pytorch-triton", match-runtime = true },
    { requirement = "nvidia-cusolver", match-runtime = true },
    { requirement = "nvidia-curand", match-runtime = true },
    { requirement = "nvidia-cublas", match-runtime = true },
    { requirement = "nvidia-cusparse", match-runtime = true },
    { requirement = "nvidia-cudnn-cu13", match-runtime = true },
    { requirement = "nvidia-nvtx", match-runtime = true },
    { requirement = "nvidia-cuda-nvrtc", match-runtime = true },
    { requirement = "nvidia-cuda-runtime", match-runtime = true },
]
```

Signed-off-by: Vadim Markovtsev

* Apply suggestion from @ksivaman

Signed-off-by: Kirthi Shankar Sivamani

Signed-off-by: Kirthi Shankar Sivamani

---------

Signed-off-by: Vadim Markovtsev
Signed-off-by: Kirthi Shankar Sivamani
Co-authored-by: Kirthi Shankar Sivamani
---
 build_tools/utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/build_tools/utils.py b/build_tools/utils.py
index 8a52440310..885901068a 100644
--- a/build_tools/utils.py
+++ b/build_tools/utils.py
@@ -228,9 +228,10 @@ def nvcc_path() -> Tuple[str, str]:
 
 def get_cuda_include_dirs() -> Tuple[str, str]:
     """Returns the CUDA header directory."""
+    force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0")))
     # If cuda is installed via toolkit, all necessary headers
     # are bundled inside the top level cuda directory.
-    if cuda_toolkit_include_path() is not None:
+    if not force_wheels and cuda_toolkit_include_path() is not None:
         return [cuda_toolkit_include_path()]
 
     # Use pip wheels to include all headers.
@@ -239,7 +240,10 @@
     except ModuleNotFoundError as e:
         raise RuntimeError("CUDA not found.")
 
-    cuda_root = Path(nvidia.__file__).parent
+    if nvidia.__file__ is not None:
+        cuda_root = Path(nvidia.__file__).parent
+    else:
+        cuda_root = Path(nvidia.__path__[0])  # namespace
     return [
         subdir / "include"
         for subdir in cuda_root.iterdir()

From 29b84c168ebbc151990e06d7a147532273837376 Mon Sep 17 00:00:00 2001
From: Oleg Goncharov <64355998+Oleg-Goncharov@users.noreply.github.com>
Date: Tue, 3 Feb 2026 02:49:34 +0100
Subject: [PATCH 2/2] [Common] Fix NVFP4 tuned-kernel numerics (#2639)

* Fixed scaling-factor computation for FP32 to match the reference
  implementation.

Signed-off-by: Oleg Goncharov

* Uncommented the tuned kernel path

Signed-off-by: Oleg Goncharov

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: Oleg Goncharov
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../cast/nvfp4/quantize_transpose_nvfp4.cuh   |  8 ++++----
 .../quantize_transpose_nvfp4_tuned_1D.cuh     | 21 ++++++++++++++++---
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
index 61c6ba9cef..99776db281 100644
--- a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
+++ b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh
@@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
   // TODO(Frank): Is there a better way to do this?
   bool return_transpose = output->has_columnwise_data();
 
-  // if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
-  //   quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
-  //   return;
-  // }
+  if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
+    quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
+    return;
+  }
 
   constexpr bool COMPUTE_ACTIVATIONS = false;
   using ParamOP = Empty;
diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
index 4119001686..061a88fd6d 100644
--- a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
+++ b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh
@@ -163,9 +163,24 @@ __device__ __forceinline__ float get_amax_of_pair(const IType2 pair) {
 template <typename SF_TYPE>
 __device__ __forceinline__ SF_TYPE
 compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) {
-  constexpr float float_max = detail::TypeExtrema<float>::max;
-  const float scale_rcp = fminf(S_enc / static_cast<float>(S_dec_block), float_max);
-  return static_cast<SF_TYPE>(scale_rcp);
+  NVTE_DEVICE_ERROR("Unsupported scaling-factor type. Only FP32 and BF16 are supported.");
+}
+
+template <>
+__device__ __forceinline__ float compute_nvfp4_scaling_coefficient<float>(
+    const nvfp4_scale_t S_dec_block, const float S_enc) {
+  const float S_dec = 1.0f / S_enc;
+  const float scale_rcp =
+      fminf(1.0f / (static_cast<float>(S_dec_block) * S_dec), detail::TypeExtrema<float>::max);
+  return scale_rcp;
+}
+
+template <>
+__device__ __forceinline__ bf16
+compute_nvfp4_scaling_coefficient<bf16>(const nvfp4_scale_t S_dec_block, const float S_enc) {
+  const float scale_rcp =
+      fminf(S_enc / (static_cast<float>(S_dec_block)), detail::TypeExtrema<bf16>::max);
+  return static_cast<bf16>(scale_rcp);
 }
 
 template