Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,10 @@ def nvcc_path() -> Tuple[str, str]:
def get_cuda_include_dirs() -> Tuple[str, str]:
"""Returns the CUDA header directory."""

force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0")))
# If cuda is installed via toolkit, all necessary headers
# are bundled inside the top level cuda directory.
if cuda_toolkit_include_path() is not None:
if not force_wheels and cuda_toolkit_include_path() is not None:
return [cuda_toolkit_include_path()]

# Use pip wheels to include all headers.
Expand All @@ -239,7 +240,10 @@ def get_cuda_include_dirs() -> Tuple[str, str]:
except ModuleNotFoundError as e:
raise RuntimeError("CUDA not found.")

cuda_root = Path(nvidia.__file__).parent
if nvidia.__file__ is not None:
cuda_root = Path(nvidia.__file__).parent
else:
cuda_root = Path(nvidia.__path__[0]) # namespace
return [
subdir / "include"
for subdir in cuda_root.iterdir()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
// TODO(Frank): Is there a better way to do this?
bool return_transpose = output->has_columnwise_data();

// if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
// quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
// return;
// }
if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
return;
}

constexpr bool COMPUTE_ACTIVATIONS = false;
using ParamOP = Empty;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,24 @@ __device__ __forceinline__ float get_amax_of_pair(const IType2 pair) {
template <typename SF_TYPE>
__device__ __forceinline__ SF_TYPE
compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) {
constexpr float float_max = detail::TypeExtrema<SF_TYPE>::max;
const float scale_rcp = fminf(S_enc / static_cast<float>(S_dec_block), float_max);
return static_cast<SF_TYPE>(scale_rcp);
NVTE_DEVICE_ERROR("Unsupported scaling-factor type. Only FP32 and BF16 are supported.");
}

template <>
__device__ __forceinline__ float compute_nvfp4_scaling_coefficient<float>(
    const nvfp4_scale_t S_dec_block, const float S_enc) {
  // Form the decoding scale as the reciprocal of the encoding scale, then
  // invert the (block scale * decode scale) product. The arithmetic is kept
  // in this two-reciprocal form deliberately so FP32 rounding matches the
  // rest of the pipeline; clamp to FLT_MAX so the coefficient stays finite.
  const float block_scale = static_cast<float>(S_dec_block);
  const float S_dec = 1.0f / S_enc;
  const float coeff = 1.0f / (block_scale * S_dec);
  return fminf(coeff, detail::TypeExtrema<float>::max);
}

template <>
__device__ __forceinline__ bf16
compute_nvfp4_scaling_coefficient<bf16>(const nvfp4_scale_t S_dec_block, const float S_enc) {
  // Divide directly in FP32, then clamp to the largest bf16-representable
  // value before narrowing so the stored coefficient can never overflow
  // when converted down to bf16.
  const float coeff = S_enc / static_cast<float>(S_dec_block);
  const float clamped = fminf(coeff, detail::TypeExtrema<bf16>::max);
  return static_cast<bf16>(clamped);
}

template <bool USE_STOCHASTIC_ROUNDING, bool USE_FAST_MATH>
Expand Down
Loading