Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,10 @@ def nvcc_path() -> Tuple[str, str]:
def get_cuda_include_dirs() -> Tuple[str, str]:
"""Returns the CUDA header directory."""

force_wheels = bool(int(os.getenv("NVTE_BUILD_USE_NVIDIA_WHEELS", "0")))
# If cuda is installed via toolkit, all necessary headers
# are bundled inside the top level cuda directory.
if cuda_toolkit_include_path() is not None:
if not force_wheels and cuda_toolkit_include_path() is not None:
return [cuda_toolkit_include_path()]

# Use pip wheels to include all headers.
Expand All @@ -239,7 +240,10 @@ def get_cuda_include_dirs() -> Tuple[str, str]:
except ModuleNotFoundError as e:
raise RuntimeError("CUDA not found.")

cuda_root = Path(nvidia.__file__).parent
if nvidia.__file__ is not None:
cuda_root = Path(nvidia.__file__).parent
else:
cuda_root = Path(nvidia.__path__[0]) # namespace
return [
subdir / "include"
for subdir in cuda_root.iterdir()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1168,10 +1168,10 @@ void quantize_transpose(const Tensor &input, const Tensor *noop, Tensor *output,
// TODO(Frank): Is there a better way to do this?
bool return_transpose = output->has_columnwise_data();

// if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
// quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
// return;
// }
if (!use_2d_quantization && (input.dtype() == DType::kBFloat16)) {
quantize_transpose_tuned_1D(input, noop, output, quant_config, stream);
return;
}

constexpr bool COMPUTE_ACTIVATIONS = false;
using ParamOP = Empty;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,9 +163,24 @@ __device__ __forceinline__ float get_amax_of_pair(const IType2 pair) {
template <typename SF_TYPE>
__device__ __forceinline__ SF_TYPE
compute_nvfp4_scaling_coefficient(const nvfp4_scale_t S_dec_block, const float S_enc) {
constexpr float float_max = detail::TypeExtrema<SF_TYPE>::max;
const float scale_rcp = fminf(S_enc / static_cast<float>(S_dec_block), float_max);
return static_cast<SF_TYPE>(scale_rcp);
NVTE_DEVICE_ERROR("Unsupported scaling-factor type. Only FP32 and BF16 are supported.");
}

template <>
__device__ __forceinline__ float compute_nvfp4_scaling_coefficient<float>(
    const nvfp4_scale_t S_dec_block, const float S_enc) {
  // Form the decoding scale as the reciprocal of the encoding scale, then
  // invert the (block scale * decode scale) product. The arithmetic is kept
  // in this two-reciprocal form deliberately so FP32 rounding matches the
  // rest of the pipeline; clamp to FLT_MAX so the coefficient stays finite.
  const float block_scale = static_cast<float>(S_dec_block);
  const float S_dec = 1.0f / S_enc;
  const float coeff = 1.0f / (block_scale * S_dec);
  return fminf(coeff, detail::TypeExtrema<float>::max);
}

template <>
__device__ __forceinline__ bf16
compute_nvfp4_scaling_coefficient<bf16>(const nvfp4_scale_t S_dec_block, const float S_enc) {
  // Divide directly in FP32, then clamp to the largest bf16-representable
  // value before narrowing so the stored coefficient can never overflow
  // when converted down to bf16.
  const float coeff = S_enc / static_cast<float>(S_dec_block);
  const float clamped = fminf(coeff, detail::TypeExtrema<bf16>::max);
  return static_cast<bf16>(clamped);
}

template <bool USE_STOCHASTIC_ROUNDING, bool USE_FAST_MATH>
Expand Down
Loading