diff --git a/build_tools/utils.py b/build_tools/utils.py index 8a52440310..885901068a 100644 --- a/build_tools/utils.py +++ b/build_tools/utils.py @@ -228,9 +228,10 @@ def nvcc_path() -> Tuple[str, str]: def get_cuda_include_dirs() -> Tuple[str, str]: """Returns the CUDA header directory.""" + force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0"))) # If cuda is installed via toolkit, all necessary headers # are bundled inside the top level cuda directory. - if cuda_toolkit_include_path() is not None: + if not force_wheels and cuda_toolkit_include_path() is not None: return [cuda_toolkit_include_path()] # Use pip wheels to include all headers. @@ -239,7 +240,10 @@ def get_cuda_include_dirs() -> Tuple[str, str]: except ModuleNotFoundError as e: raise RuntimeError("CUDA not found.") - cuda_root = Path(nvidia.__file__).parent + if nvidia.__file__ is not None: + cuda_root = Path(nvidia.__file__).parent + else: + cuda_root = Path(nvidia.__path__[0]) # namespace return [ subdir / "include" for subdir in cuda_root.iterdir() diff --git a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh index 61c6ba9cef..99776db281 100644 --- a/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh +++ b/transformer_engine/common/cast/nvfp4/quantize_transpose_nvfp4.cuh @@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output, // TODO(Frank): Is there a better way to do this? bool return_transpose = output->has_columnwise_data(); - // if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) { - // quantize_transpose_tuned_1D(input, noop, output, quant_config, stream); - // return; - // } + if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) { + quantize_transpose_tuned_1D(input, noop, output, quant_config, stream); + return; + } constexpr bool COMPUTE_ACTIVATIONS = false; using ParamOP = Empty; diff --git a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh index 4119001686..061a88fd6d 100644 --- a/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh +++ b/transformer_engine/common/cast/nvfp4/specialized/quantize_transpose_nvfp4_tuned_1D.cuh @@ -163,9 +163,24 @@ __device__ __forceinline__ float get_amax_of_pair(const IType2 pair) { template __device__ __forceinline__ SF_TYPE compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) { - constexpr float float_max = detail::TypeExtrema::max; - const float scale_rcp = fminf(S_enc / static_cast(S_dec_block), float_max); - return static_cast(scale_rcp); + NVTE_DEVICE_ERROR("Unsupported scaling-factor type. Only FP32 and BF16 are supported."); +} + +template <> +__device__ __forceinline__ float compute_nvfp4_scaling_coefficient( + const nvfp4_scale_t S_dec_block, const float S_enc) { + const float S_dec = 1.0f / S_enc; + const float scale_rcp = + fminf(1.0f / (static_cast(S_dec_block) * S_dec), detail::TypeExtrema::max); + return scale_rcp; +} + +template <> +__device__ __forceinline__ bf16 +compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) { + const float scale_rcp = + fminf(S_enc / (static_cast(S_dec_block)), detail::TypeExtrema::max); + return static_cast(scale_rcp); } template