82 commits
3b4366b
Fix CI failures for UB overlap changes (#2149)
djns99 Sep 3, 2025
f378eaf
[JAX] Fix failing fused attn tests for dropout=0.1 and bias for sm100…
KshitijLakhani Sep 3, 2025
0f68f7b
[PyTorch][CUDA Graph] Fix FP8 Weight Quantization Cache under CUDA Gr…
zhongbozhu Sep 4, 2025
e9a5fa4
[PyTorch] fix cross entropy vanishing gradients (#2139)
casper-hansen Sep 4, 2025
11e9d66
Fix bug when enabling --overlap-grad-reduce in mcore (#2142)
lhb8125 Sep 5, 2025
b10f436
Fix CUDA version in setup.py (#2132)
vcherepanov-nv Sep 5, 2025
c47f329
[JAX] NoScaleTensor wrapper for non-quantized data (#2136)
jberchtold-nvidia Sep 5, 2025
5b3d65c
[JAX] Fix GroupedScaledTensor creation with keyword arg (#2154)
phu0ngng Sep 8, 2025
aa06107
Fixing few issues with multi-process launching. (#2155)
mingxu1067 Sep 8, 2025
603dbf7
Update list of authorized CI users (#2152)
timmoon10 Sep 8, 2025
84fa28d
Fused RoPE with combined QKV input. (#2122)
vasunvidia Sep 8, 2025
a26a7f1
Add bf16/fp32 token-per-expert to the MoE aux loss kernel (#2162)
Autumn1998 Sep 9, 2025
5f2b831
[JAX] Scale swizzling via JAX transpose op (#2163)
phu0ngng Sep 9, 2025
4903f94
Extract cpp distributed tests into a separate project (#2165)
vcherepanov-nv Sep 10, 2025
483d959
Adds context parallelism utilities: moving cp shards to diff ranks an…
jomitchellnv Sep 10, 2025
405d474
[PyTorch Debug] Fix issue with negative underflow% stat. (#2107)
pggPL Sep 15, 2025
cd2034f
Lower precision gated-act to accelerate FP8 current-scaling. (#2153)
mingxu1067 Sep 15, 2025
59130cc
[PyTorch] Support activation CPU offloading in fusible ops (#2158)
timmoon10 Sep 15, 2025
258d084
Do not use normalization forward + amax fusion if cuDNN backend is re…
janekb04 Sep 16, 2025
c221909
Fix unjoined comm stream in UB communicator (#2160)
djns99 Sep 16, 2025
ba37529
FP8 Output Quantization for GEMM (#2123)
vthumbe1503 Sep 17, 2025
7042d7a
TE Gemma tutorial attempt#2 (#1839)
sudhakarsingh27 Sep 17, 2025
93a67af
Fix memory overhead of linear layer when all gather from sequence par…
yuzhongw-nvidia Sep 17, 2025
eb69fad
Fix incorrect TP rank calculation when using data parallel (#2179)
djns99 Sep 17, 2025
8aee1bb
[Pytorch] Add Cutlass Grouped GEMM Support for fine-grained MoE Model…
cassiewilliam Sep 18, 2025
c334fc4
[PyTorch] Support FA3 for MLA and with CP (#1907)
zhujian19891203 Sep 18, 2025
7f77127
Fix cuDNN version checks when getting backend and for sm89 kv cache (…
KshitijLakhani Sep 18, 2025
5b3092a
Changed VERSION to 2.9.0.dev0
ptrendx Sep 19, 2025
57b4d7b
[JAX] Remove import jax.extend.ffi (#2193)
phu0ngng Sep 22, 2025
5e4e0b2
[PyTorch] Add sink attention support from cuDNN (#2148)
cyanguwa Sep 22, 2025
2db20a6
[QA] Add pytest xml report for all tests in qa folder that use pytest…
shengfangd Sep 23, 2025
a92a0ad
[JAX] Local-Amax for Current-Scaling (#2183)
mingxu1067 Sep 23, 2025
3f875fb
[JAX] Restore Shardy Rule with CompoundFactor (#2167)
phu0ngng Sep 23, 2025
afd15a1
[JAX] Update JAX version requirement in pyproject.toml (#2197)
phu0ngng Sep 24, 2025
972a842
Merge branch 'main' into hongbinl/adapt_for_offload_activation
Sep 26, 2025
7933781
temp fix to enable --overlap-grad-reduce
Sep 26, 2025
9e72796
[PyTorch] Unpin version of onnxscript and onnxruntime (#2202)
pggPL Sep 26, 2025
4d14578
[JAX] Fix XML filename in the L0_jax_uniitest (#2205)
phu0ngng Sep 27, 2025
d75bf43
[JAX] CollectiveGemm (#2166)
phu0ngng Sep 27, 2025
963b39c
fix to enable --overlap-grad-reduce
lhb8125 Sep 29, 2025
a91e458
[JAX] Add xml export for `test_multiprocessing_encoder` and `test_cge…
phu0ngng Sep 29, 2025
dfeef1a
[JAX] Address tolerance check for current scaling dact dbias (#2211)
jberchtold-nvidia Sep 29, 2025
3f5b475
[Core][PyTorch] NVFP4 recipe (#2177)
ksivaman Sep 29, 2025
2354fb8
Fix the segfault in the nvfp4 quantization (#2214)
ptrendx Sep 30, 2025
25252e9
[PyTorch] Add FP8 attention with current scaling (#2012)
cyanguwa Sep 30, 2025
7fa0f55
[Pytorch] Support for Swiglu Activation used in GPT OSS (#2161)
vthumbe1503 Sep 30, 2025
ce18bee
[JAX] Load modules during initialize for Norm and Act primitives (#2219)
jberchtold-nvidia Sep 30, 2025
7022d50
[PyTorch] Quantizer as API (#2039)
negvet Oct 1, 2025
ac4e0fd
[JAX] Rework amax reduction over TPSP (#2218)
phu0ngng Oct 1, 2025
b0d562d
[JAX] Fix `rng_state` shape in fused attention (#2217)
phu0ngng Oct 1, 2025
ac886c3
[PyTorch] Fix QuantizedTensorBase -> QuantizedTensorStorage (#2226)
negvet Oct 1, 2025
f0a9404
Fix hang during debug build (#2221)
ksivaman Oct 1, 2025
90449f7
Convert `NVFP4BlockScaling` to dataclass (#2227)
ksivaman Oct 1, 2025
aee5a82
Fix the cuBLAS workspace alignment (#2223)
ptrendx Oct 1, 2025
c100318
[PyTorch] Set usages for linear op quantizers before forward (#2222)
timmoon10 Oct 2, 2025
f936c2a
[JAX] Fix code block in fp8_autocast docstring (#2228)
jberchtold-nvidia Oct 2, 2025
be7f43f
[JAX] Fix shard map issue when `get_all_mesh_axes()` is used (#2229)
jberchtold-nvidia Oct 2, 2025
e30c36a
[PyTorch] fix int32 overflow in permute kernels (#2196)
hxbai Oct 2, 2025
b840898
[JAX] Clamped Swiglu Integration (#2194)
vthumbe1503 Oct 3, 2025
dfe5b7d
[Common][Pytorch] Add support for the FP8 Block Scaling (ie. Deepseek…
janekb04 Oct 3, 2025
5be8125
Fix bug where CUTLASS kernel was not being compiled for SM90a (#2235)
timmoon10 Oct 4, 2025
08779fd
Fix FP8 current scaling attention logic (#2234)
ksivaman Oct 4, 2025
7e45be7
Added the NVFP4 section to the low precision training tutorial (#2237)
ptrendx Oct 5, 2025
0db0f4d
[JAX] Fix for GEMM + fuse bias + AllReduce (#2230)
phu0ngng Oct 6, 2025
56e2fed
[Build] fix: TE installation failed to find uv-installed cuDNN librar…
KivenChen Oct 6, 2025
9f3e79b
[PyTorch] Fix tests for 🤗 integration (#2239)
ksivaman Oct 6, 2025
127b6d3
[JAX] Activation/Normalization to output amax for later quantization …
phu0ngng Oct 7, 2025
76bced5
`NVFP4BlockScaling` recipe docs (#2241)
ksivaman Oct 7, 2025
ac5e868
Skip fp8 tests on unsupported devices (#2243)
vcherepanov-nv Oct 7, 2025
66f9b3c
[PyTorch] Unblock fused bgrad quantization path for nvfp4 (#2246)
ksivaman Oct 8, 2025
af2a0c1
[JAX] Async issuing D2H memcpy for grouped_gemm group_sizes array (#2…
huanghua1994 Oct 8, 2025
e37e33e
Disallow pure E5M2 recipe for `Float8BlockScaling` (#2251)
ksivaman Oct 9, 2025
9bf4175
[PyTorch] Deprecate old `float8_tensor.py` (#2250)
ksivaman Oct 9, 2025
e99be1b
Update minimum python version to 3.10 and add checks in CI (#2247)
ksivaman Oct 9, 2025
8a7ab3d
[JAX] NVFP4 support in TE/JAX (#2254)
jberchtold-nvidia Oct 9, 2025
dd9433e
Don't pickle an empty dict in LayerNorm and pt base modules (#2253)
pstjohn Oct 9, 2025
663fc8e
Merge branch 'main' into hongbinl/adapt_for_offload_activation
lhb8125 Oct 11, 2025
e9f49f4
Merge branch 'hongbinl/adapt_for_offload_activation' of https://githu…
lhb8125 Oct 11, 2025
98d354c
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 11, 2025
88c7b05
add comments
lhb8125 Oct 11, 2025
665bcd1
Merge branch 'hongbinl/adapt_for_offload_activation' of https://githu…
lhb8125 Oct 11, 2025
fe9bab4
remove unused code
lhb8125 Oct 21, 2025
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
@@ -19,7 +19,7 @@ jobs:
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake==3.21.0 pybind11[global] ninja
pip install cmake==3.21.0 pybind11[global] ninja nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
@@ -43,7 +43,7 @@ jobs:
run: |
apt-get update
apt-get install -y git python3.9 pip cudnn9-cuda-12
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript
pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
@@ -63,7 +63,7 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install pybind11[global]
run: pip install pybind11[global] nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
@@ -83,7 +83,7 @@ jobs:
options: --user root
steps:
- name: 'Dependencies'
run: pip install torch pybind11[global] einops onnxscript
run: pip install torch pybind11[global] einops onnxscript nvidia-mathdx==25.1.1
- name: 'Checkout'
uses: actions/checkout@v3
with:
1 change: 1 addition & 0 deletions .github/workflows/trigger-ci.yml
@@ -57,6 +57,7 @@ jobs:
|| github.actor == 'tdophung'
|| github.actor == 'vthumbe1503'
|| github.actor == 'janekb04'
|| github.actor == 'shengfangd'
)
steps:
- name: Check if comment is issued by authorized person
3 changes: 3 additions & 0 deletions .gitmodules
@@ -4,3 +4,6 @@
[submodule "3rdparty/cudnn-frontend"]
path = 3rdparty/cudnn-frontend
url = https://github.com/NVIDIA/cudnn-frontend.git
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/NVIDIA/cutlass.git
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -38,3 +38,9 @@ repos:
entry: clang-format -i
args: ["-style=file"]
files: ^transformer_engine.*\.(c|cc|cxx|cpp|cu|cuh|h|hpp)$

- repo: https://github.com/netromdk/vermin
rev: c75aca72f4e85c6e47252139e8695f1c8b5f9ae3
hooks:
- id: vermin
args: ['-t=3.10', '--violations']
1 change: 1 addition & 0 deletions 3rdparty/cutlass
Submodule cutlass added at 57e3cf
152 changes: 152 additions & 0 deletions benchmarks/benchmark_rht_cast.py
@@ -0,0 +1,152 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

import argparse
import torch
import pandas as pd
import torch.utils.benchmark as benchmark

import transformer_engine.pytorch as te
import transformer_engine_torch as tex
import transformer_engine.pytorch.cpp_extensions as ext

from transformer_engine.pytorch.tensor.nvfp4_tensor import NVFP4Quantizer

scale_padding_to = 1
permute_scale = False

TORCH_TO_TE_FLOAT_MAP = {
torch.bfloat16: tex.DType.kBFloat16,
}


def run_kernel(shape, stochastic_rounding: bool, input_dtype=torch.bfloat16):
# Generate random input data
M, K = shape
x = torch.randn([M, K], dtype=input_dtype, device="cuda")

assert shape[0] % 16 == 0, "Shape must be divisible by 16"
assert shape[1] % 16 == 0, "Shape must be divisible by 16"

# Quantize
nvfp4_quantizer = NVFP4Quantizer(
fp4_dtype=tex.DType.kFloat4E2M1,
rowwise=True,
columnwise=True,
with_amax_reduction=False,
amax_reduction_group=None,
with_rht=True,
with_post_rht_amax=True,
with_random_sign_mask=True,
stochastic_rounding=stochastic_rounding,
)
x_nvfp4_sut = nvfp4_quantizer.make_empty(
(M, K), dtype=x.dtype, device=x.device, requires_grad=False
)
x_nvfp4_sut = nvfp4_quantizer.update_quantized(x, x_nvfp4_sut)

with torch.no_grad():
stmt = "kernel_func(input, output)"
globals_dict = {
"kernel_func": nvfp4_quantizer.update_quantized,
"input": x,
"output": x_nvfp4_sut,
}

timing = benchmark.Timer(
stmt=stmt,
globals=globals_dict,
num_threads=1,
).blocked_autorange(min_run_time=5)
print(timing)
timing_us = timing.median * 1e6

input_nbytes = shape[0] * shape[1] * 2 # bf16
output_nbytes = shape[0] * shape[1] // 2 # //2 for fp4
sf_nbytes = shape[0] * shape[1] // 16 # //16 for 1 byte per 16 elems

total_nbytes = (
0
+ input_nbytes
* 3 # Reading input for Amax(x)&Amax(RHT(x.T)), Reading input for Cast(x), Reading input for Cast(RHT(x.T))
+ 2 * 4 # Output 2 * float for scale & amax
+ 2 * 4 # Input 2 * float
+ output_nbytes * 2 # Output from Cast(x) and Cast(RHT(x.T))
+ sf_nbytes * 2 # Scale factor
)

throughput_GBps = total_nbytes / (1024 * 1024 * 1024) / (timing_us / 1e6)

print(
f"Stochastic rounding: {stochastic_rounding}, Total: {total_nbytes} bytes, Throughput:"
f" {throughput_GBps} GB/s"
)
return timing_us, throughput_GBps


# Nsight Compute Profiling Command:
# ncu -f -o block_scaled_1d_cast_transpose_kernel --set=full --kernel-name "block_scaled_1d_cast_transpose_kernel" -s 5 -c 5 python benchmark_cast_transpose_1d_block.py --profile

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
args = parser.parse_args()

if args.profile:
print("Profiling is enabled.")
else:
print("Profiling is disabled.")

shapes = [
(8192, 5120),
(8192, 10240),
(8192, 2560),
(8192, 11328),
(8192, 512),
(8192, 3584),
(5120, 8192),
(10240, 8192),
(2560, 8192),
(11328, 8192),
(512, 8192),
(3584, 8192),
(4096, 16384),
(14336, 16384),
]

if args.profile:
shapes = [
(16384, 6144),
]

data = []
for stochastic_rounding in [True]: # , False]:
for shape in shapes:
print(
f"Running benchmark_func with shape {shape} and stochastic_rounding"
f" {stochastic_rounding}"
)
timing_us, throughput_GBps = run_kernel(shape, stochastic_rounding)
data.append(
[
"benchmark_func",
shape,
stochastic_rounding,
timing_us,
throughput_GBps,
]
)

df = pd.DataFrame(
data=data,
columns=[
"kernel",
"shape",
"stochastic_rounding",
"timing_us",
"throughput(GB/s)",
],
)
print(df)
df.to_csv("benchmark_cast_nvfp4.csv", index=False)
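The throughput figure printed by the benchmark above comes from a byte-traffic model: the bf16 input is read three times (amax pass, Cast(x), Cast(RHT(x.T))), each fp4 output is half a byte per element, and one scale-factor byte covers 16 elements. A minimal standalone sketch of that accounting (helper names here are illustrative, not part of the benchmark):

```python
def nvfp4_cast_traffic_bytes(M: int, K: int) -> int:
    """Estimate bytes moved by the fused RHT + NVFP4 cast, mirroring
    the accounting in benchmark_rht_cast.py."""
    input_nbytes = M * K * 2    # bf16 input, 2 bytes per element
    output_nbytes = M * K // 2  # fp4 output, 2 elements per byte
    sf_nbytes = M * K // 16     # 1 scale-factor byte per 16 elements
    return (
        input_nbytes * 3        # input read for amax, Cast(x), Cast(RHT(x.T))
        + 2 * 4                 # scale & amax outputs (two float32 scalars)
        + 2 * 4                 # scale & amax inputs (two float32 scalars)
        + output_nbytes * 2     # rowwise and columnwise fp4 outputs
        + sf_nbytes * 2         # scale factors for both outputs
    )


def throughput_gbps(total_bytes: int, timing_us: float) -> float:
    """Convert bytes moved and kernel time in microseconds to GiB/s."""
    return total_bytes / (1024**3) / (timing_us / 1e6)
```

For the (8192, 5120) shape in the benchmark's list, the model counts roughly 0.28 GiB of traffic per kernel invocation.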
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
2.8.0.dev0
2.9.0.dev0
1 change: 1 addition & 0 deletions build_tools/build_ext.py
@@ -57,6 +57,7 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
build_dir,
f"-DPython_EXECUTABLE={sys.executable}",
f"-DPython_INCLUDE_DIR={sysconfig.get_path('include')}",
f"-DPython_SITEARCH={sysconfig.get_path('platlib')}",
f"-DCMAKE_BUILD_TYPE={build_type}",
f"-DCMAKE_INSTALL_PREFIX={install_dir}",
]
1 change: 1 addition & 0 deletions build_tools/jax.py
@@ -87,4 +87,5 @@ def setup_jax_extension(
sources=[str(path) for path in sources],
include_dirs=[str(path) for path in include_dirs],
extra_compile_args=cxx_flags,
libraries=["nccl"],
)
4 changes: 2 additions & 2 deletions build_tools/pytorch.py
@@ -14,12 +14,12 @@

def install_requirements() -> List[str]:
"""Install dependencies for TE/PyTorch extensions."""
return ["torch>=2.1", "einops", "onnxscript==0.3.1", "onnx"]
return ["torch>=2.1", "einops", "onnxscript", "onnx"]


def test_requirements() -> List[str]:
"""Test dependencies for TE/JAX extensions."""
return ["numpy", "torchvision", "transformers"]
return ["numpy", "torchvision", "transformers", "torchao==0.13"]


def setup_pytorch_extension(
34 changes: 28 additions & 6 deletions build_tools/utils.py
@@ -12,12 +12,31 @@
import shutil
import subprocess
import sys
import platform
from pathlib import Path
from importlib.metadata import version as get_version
from subprocess import CalledProcessError
from typing import List, Optional, Tuple, Union


# Needs to stay consistent with .pre-commit-config.yaml config.
def min_python_version() -> Tuple[int]:
"""Minimum supported Python version."""
return (3, 10, 0)


def min_python_version_str() -> str:
"""String representing minimum supported Python version."""
return ".".join(map(str, min_python_version()))


if sys.version_info < min_python_version():
raise RuntimeError(
f"Transformer Engine requires Python {min_python_version_str()} or newer, "
f"but found Python {platform.python_version()}."
)


@functools.lru_cache(maxsize=None)
def debug_build_enabled() -> bool:
"""Whether to build with a debug configuration"""
@@ -234,15 +253,18 @@ def get_cuda_include_dirs() -> Tuple[str, str]:

@functools.lru_cache(maxsize=None)
def cuda_archs() -> str:
version = cuda_version()
if os.getenv("NVTE_CUDA_ARCHS") is None:
archs = os.getenv("NVTE_CUDA_ARCHS")
if archs is None:
version = cuda_version()
if version >= (13, 0):
os.environ["NVTE_CUDA_ARCHS"] = "75;80;89;90;100;120"
archs = "75;80;89;90;100;100a;103a;120"
elif version >= (12, 9):
archs = "70;80;89;90;100;100a;103a;120"
elif version >= (12, 8):
os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90;100;120"
archs = "70;80;89;90;100;100a;120"
else:
os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90"
return os.getenv("NVTE_CUDA_ARCHS")
archs = "70;80;89;90"
return archs


def cuda_version() -> Tuple[int, ...]:
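The `cuda_archs()` change above replaces the old pattern of writing the default arch list back into `os.environ` with a single read of `NVTE_CUDA_ARCHS` followed by a plain return. A standalone sketch of the same fallback logic — taking the CUDA version and environment as parameters instead of detecting them, purely for illustration:

```python
def select_cuda_archs(cuda_version: tuple, env: dict) -> str:
    """Pick the CUDA arch list, mirroring the patched build_tools/utils.py:
    an explicit NVTE_CUDA_ARCHS wins; otherwise fall back by toolkit version,
    without mutating the environment."""
    archs = env.get("NVTE_CUDA_ARCHS")
    if archs is None:
        if cuda_version >= (13, 0):
            archs = "75;80;89;90;100;100a;103a;120"
        elif cuda_version >= (12, 9):
            archs = "70;80;89;90;100;100a;103a;120"
        elif cuda_version >= (12, 8):
            archs = "70;80;89;90;100;100a;120"
        else:
            archs = "70;80;89;90"
    return archs
```

Note that only the unset-variable branch consults the toolkit version; an explicit `NVTE_CUDA_ARCHS` always takes precedence, and nothing is written back, so repeated calls stay side-effect free.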
2 changes: 2 additions & 0 deletions docs/api/common.rst
@@ -12,6 +12,8 @@ Common API

.. autoapiclass:: transformer_engine.common.recipe.MXFP8BlockScaling(fp8_format=Format.E4M3)

.. autoapiclass:: transformer_engine.common.recipe.NVFP4BlockScaling(fp4_format=Format.E2M1)

.. autoapiclass:: transformer_engine.common.recipe.Float8CurrentScaling(fp8_format=Format.HYBRID)

.. autoapiclass:: transformer_engine.common.recipe.Float8BlockScaling(fp8_format=Format.E4M3)
Binary file added docs/examples/FP4_format.png
Binary file added docs/examples/FP4_linear.png
2 changes: 1 addition & 1 deletion docs/examples/attention/attention.ipynb
@@ -390,7 +390,7 @@
"| Attention Backend | Precision | Architecture | Sliding Window Attention | MQA/GQA | Multi-Latent Attention | Context Parallelism | Determinism Possible |\n",
"| :---------------- | :-------- | :----------- | :----------------------- | :------ | :--------------------- | :------------------ | :------------ |\n",
"| cuDNN attention (all frameworks) | BF16, FP16, FP8 (PyTorch only) | sm80+ | No | Yes | Yes | Yes (`bshd`,`sbhd`, `thd`) | Yes |\n",
"| flash-attention (PyTorch) | BF16, FP16 | sm80+ | Yes | Yes | No | Yes (`bshd`,`thd`) | Yes |\n",
"| flash-attention (PyTorch) | BF16, FP16 | sm80+ | Yes | Yes | Yes | Yes (`bshd`,`thd`) | Yes |\n",
"| Framework-native attention | BF16, FP16, FP32 | Any | No, unless used as a mask | Yes | Yes (PyTorch only) | No | Yes |\n",
"\n",
"Some unit tests are provided to serve as a starting point for integrating such features into users' models. For example,\n",