From 6cb0b648e45b5b9964f926d7df584853fab25cc2 Mon Sep 17 00:00:00 2001
From: Thomas Benson
Date: Tue, 27 Jan 2026 13:45:10 -0800
Subject: [PATCH 1/2] Add nvbench-based benchmarks for the fltflt data type

Add several benchmarks that are built via -DMATX_BUILD_BENCHMARKS=ON.
Also add bench/scripts/run_fltflt_benchmarks.py, which runs the fltflt
benchmarks and summarizes the results.

Example results when running on an RTX PRO 6000 Blackwell Server Edition
are as follows:

Performance relative to single-precision (float = 1.0x baseline)
Higher values indicate slower performance

Benchmark     float     double     fltflt     fltflt vs dbl
------------------------------------------------------------------
add           1.00x     71.10x     28.84x      2.47x
sub           1.00x     71.11x     28.85x      2.46x
mul           1.00x     71.17x     10.15x      7.01x
div           1.00x     52.63x      5.85x      8.99x
sqrt          1.00x     52.40x      3.89x     13.48x
abs           1.00x      2.17x      2.15x      1.01x
fma           1.00x     71.13x     25.36x      2.81x
madd          1.00x     71.14x     38.78x      1.83x
------------------------------------------------------------------

Note that addition and subtraction are only ~2.5x faster using fltflt
than fp64. Multiplication, division, and square root are significantly
faster. Future updates may improve addition performance, but potentially
at an accuracy cost, so the changes will likely be opt-in.

Signed-off-by: Thomas Benson
---
 bench/00_misc/fltflt_arithmetic.cu     | 536 +++++++++++++++++++++++++
 bench/CMakeLists.txt                   |   1 +
 bench/scripts/run_fltflt_benchmarks.py | 343 ++++++++++++++++
 include/matx/kernels/fltflt.h          |   3 +
 4 files changed, 883 insertions(+)
 create mode 100644 bench/00_misc/fltflt_arithmetic.cu
 create mode 100755 bench/scripts/run_fltflt_benchmarks.py

diff --git a/bench/00_misc/fltflt_arithmetic.cu b/bench/00_misc/fltflt_arithmetic.cu
new file mode 100644
index 00000000..5ce24d2b
--- /dev/null
+++ b/bench/00_misc/fltflt_arithmetic.cu
@@ -0,0 +1,536 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2026, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+// Benchmarks for fltflt (float-float) arithmetic operations.
+// Compares performance across three precision modes: float, double, and fltflt.
+
+#include "matx.h"
+#include <nvbench/nvbench.cuh>
+
+using namespace matx;
+
+// Precision types to compare
+using precision_types = nvbench::type_list<float, double, matx::fltflt>;
+
+//==============================================================================
+// Custom kernels that perform operations iteratively to increase arithmetic intensity
+//==============================================================================
+
+// Instruction-level parallelism factor
+static constexpr int ILP_FACTOR = 8;
+// Unroll factor for the inner loop
+static constexpr int ITER_UNROLL_FACTOR = 16;
+
+template <typename T>
+__global__ void iterative_add_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    // Initialize multiple independent accumulators in registers
+    T acc[ILP_FACTOR] = {};
+    const T val = static_cast<T>(0.123456789);
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      // Independent operations for ILP - fully unrollable
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] + val;
+      }
+    }
+
+    // Combine and write to prevent optimization
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_sub_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR] = {};
+    const T val = static_cast<T>(0.001234567);
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] - val;
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_mul_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val = static_cast<T>(1.0000001);
+
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val * val;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 2; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] * val;
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_div_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val = static_cast<T>(1.0000001);
+
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val / val;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 1; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] / val;
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_sqrt_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T val[ILP_FACTOR];
+    const T init_val = static_cast<T>(2.718281828);
+
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      val[ilp] = sqrt(init_val);
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 1; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        val[ilp] = sqrt(val[ilp]);
+      }
+    }
+
+    T result_val = val[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + val[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_abs_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val = static_cast<T>(-0.123456789);
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = -abs(acc[ilp]);
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_fma_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val_a = static_cast<T>(1.001);
+    const T val_b = static_cast<T>(1.002);
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val_b;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        if constexpr (std::is_same_v<T, matx::fltflt>) {
+          acc[ilp] = fltflt_fma(val_a, acc[ilp], val_b);
+        } else {
+          acc[ilp] = val_a * acc[ilp] + val_b;
+        }
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+//==============================================================================
+// Addition Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_add(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  // Create output tensor only
+  auto result = make_tensor<T>({size});
+
+  // Add metrics
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  // Benchmark execution
+  state.exec([&](nvbench::launch &launch) {
+    iterative_add_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_add, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Subtraction Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_sub(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_sub_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_sub, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Multiplication Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_mul(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_mul_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_mul, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Division Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_div(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_div_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_div, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Square Root Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_sqrt(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_sqrt_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_sqrt, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Absolute Value Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_abs(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_abs_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_abs, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Fused Multiply-Add Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_fma(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_fma_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_fma, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Multiply-Add (MADD) Benchmark - Separate Multiply and Add Operations
+//==============================================================================
+template <typename T>
+__global__ void iterative_madd_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val_a = static_cast<T>(1.001);
+    const T val_b = static_cast<T>(1.002);
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val_b;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        if constexpr (std::is_same_v<T, matx::fltflt>) {
+          // Explicitly separate multiply and add for fltflt
+          acc[ilp] = fltflt_add(fltflt_mul(val_a, acc[ilp]), val_b);
+        } else {
+          // For float/double, use natural expression (may or may not fuse)
+          acc[ilp] = val_a * acc[ilp] + val_b;
+        }
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+void fltflt_bench_madd(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_madd_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_madd, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index 035b7ad4..f4970022 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -1,4 +1,5 @@
 set (bench_sources
+    00_misc/fltflt_arithmetic.cu
     00_transform/matmul.cu
     00_transform/fft.cu
     00_transform/conv.cu
diff --git a/bench/scripts/run_fltflt_benchmarks.py b/bench/scripts/run_fltflt_benchmarks.py
new file mode 100755
index 00000000..a866ada7
--- /dev/null
+++ b/bench/scripts/run_fltflt_benchmarks.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3

+# BSD 3-Clause License
+#
+# Copyright (c) 2026, NVIDIA Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+Run fltflt arithmetic benchmarks and summarize results.
+Shows performance relative to single-precision (1x baseline).
+""" + +import subprocess +import re +import sys +from pathlib import Path +from collections import defaultdict + +def find_benchmark_executable(build_dir): + """Find the matx_bench executable.""" + benchmark_path = build_dir / "bench" / "matx_bench" + + if benchmark_path.exists(): + return benchmark_path + + print(f"Error: Could not find matx_bench at {benchmark_path}") + return None + +def run_benchmark(executable_path, benchmark_name): + """Run a specific benchmark and capture output.""" + print(f"Running benchmark: {benchmark_name}") + + try: + result = subprocess.run( + [str(executable_path), "--benchmark", benchmark_name], + capture_output=True, + text=True, + timeout=300 # 5 minute timeout + ) + + if result.returncode != 0: + print(f" Warning: Benchmark failed with return code {result.returncode}") + print(f" stderr: {result.stderr}") + return None + + return result.stdout + except subprocess.TimeoutExpired: + print(f" Benchmark timed out after 5 minutes") + return None + except Exception as e: + print(f" Error running benchmark: {e}") + return None + +def parse_time_value(time_str): + """Parse time string like '668.707 us' or '6.785 ms' and convert to milliseconds.""" + time_str = time_str.strip() + + # Match number and unit + match = re.match(r'([\d.]+)\s*(us|ms|ns|s)', time_str) + if not match: + return None + + value = float(match.group(1)) + unit = match.group(2) + + # Convert to milliseconds + if unit == 'us': + return value / 1000.0 + elif unit == 'ms': + return value + elif unit == 'ns': + return value / 1_000_000.0 + elif unit == 's': + return value * 1000.0 + else: + return value + +def parse_benchmark_output(output): + """ + Parse the table format output from nvbench. + + Expected format: + | T | Array Size | ... | GPU Time | ... + |--------------|-----------------|-----|------------|----- + | F32 | ... | 668.707 us | ... + | F64 | ... | 47.650 ms | ... + | matx::fltflt | ... | 6.785 ms | ... + """ + results = {} + lines = output.strip().split('\n') + + # Find the header line to locate GPU Time column + gpu_time_col_idx = None + for i, line in enumerate(lines): + if '|' in line and 'GPU Time' in line: + # Split by | and find GPU Time column index + cols = [col.strip() for col in line.split('|')] + for j, col in enumerate(cols): + if 'GPU Time' in col: + gpu_time_col_idx = j + break + break + + if gpu_time_col_idx is None: + print(" Warning: Could not find GPU Time column in output") + return results + + # Parse data rows + for line in lines: + if '|' not in line: + continue + + # Skip header and separator lines + if 'GPU Time' in line or '---' in line or 'T' in line.split('|')[1]: + continue + + cols = [col.strip() for col in line.split('|')] + + if len(cols) <= gpu_time_col_idx: + continue + + # Get type column (usually first or second) + type_col = cols[1] if len(cols) > 1 else None + + if not type_col: + continue + + # Map type names + if 'F32' in type_col: + precision = 'float' + elif 'F64' in type_col: + precision = 'double' + elif 'fltflt' in type_col: + precision = 'fltflt' + else: + continue + + # Extract GPU time + gpu_time_str = cols[gpu_time_col_idx] + gpu_time_ms = parse_time_value(gpu_time_str) + + if gpu_time_ms is not None: + results[precision] = gpu_time_ms + + return results + +def calculate_relative_performance(results): + """ + Calculate performance relative to float (single-precision). 
+    float = 1.0x (baseline)
+    Higher values mean slower (took more time relative to float)
+    """
+    relative = {}
+
+    for bench_name, timings in results.items():
+        if 'float' not in timings:
+            print(f"Warning: No float baseline for {bench_name}, skipping")
+            continue
+
+        float_time = timings['float']
+        relative[bench_name] = {}
+
+        for precision, time_value in timings.items():
+            # Relative slowdown: how many times slower than float
+            relative[bench_name][precision] = time_value / float_time
+
+    return relative
+
+def print_summary(results, relative):
+    """Print a formatted summary table."""
+    print("\n")
+    print("=" * 80)
+    print("FLTFLT BENCHMARK SUMMARY")
+    print("=" * 80)
+    print()
+    print("Performance relative to single-precision (float = 1.0x baseline)")
+    print("Higher values indicate slower performance")
+    print()
+
+    # Print header
+    print(f"{'Benchmark':<15} {'float':<12} {'double':<12} {'fltflt':<12} {'fltflt vs dbl':<15}")
+    print("-" * 66)
+
+    # Order benchmarks
+    bench_order = ['add', 'sub', 'mul', 'div', 'sqrt', 'abs', 'fma', 'madd']
+
+    for bench in bench_order:
+        if bench not in relative:
+            continue
+
+        rel = relative[bench]
+        timings = results[bench]
+
+        # Get values with defaults
+        float_rel = rel.get('float', 1.0)
+        double_rel = rel.get('double', None)
+        fltflt_rel = rel.get('fltflt', None)
+
+        # Calculate fltflt speedup vs double (double_time / fltflt_time)
+        fltflt_vs_double = None
+        if 'double' in timings and 'fltflt' in timings:
+            fltflt_vs_double = timings['double'] / timings['fltflt']
+
+        # Format output
+        float_str = f"{float_rel:.2f}x"
+        double_str = f"{double_rel:.2f}x" if double_rel is not None else "N/A"
+        fltflt_str = f"{fltflt_rel:.2f}x" if fltflt_rel is not None else "N/A"
+        speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A"
+
+        print(f"{bench:<15} {float_str:<12} {double_str:<12} {fltflt_str:<12} {speedup_str:<15}")
+
+    print()
+    print("-" * 80)
+    print("Raw timings (milliseconds):")
+    print()
+    print(f"{'Benchmark':<15} {'float':<12} {'double':<12} {'fltflt':<12} {'fltflt vs dbl':<15}")
+    print("-" * 66)
+
+    for bench in bench_order:
+        if bench not in results:
+            continue
+
+        timings = results[bench]
+
+        float_time = timings.get('float', None)
+        double_time = timings.get('double', None)
+        fltflt_time = timings.get('fltflt', None)
+
+        # Calculate fltflt speedup vs double
+        fltflt_vs_double = None
+        if double_time is not None and fltflt_time is not None:
+            fltflt_vs_double = double_time / fltflt_time
+
+        float_str = f"{float_time:.3f}" if float_time is not None else "N/A"
+        double_str = f"{double_time:.3f}" if double_time is not None else "N/A"
+        fltflt_str = f"{fltflt_time:.3f}" if fltflt_time is not None else "N/A"
+        speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A"
+
+        print(f"{bench:<15} {float_str:<12} {double_str:<12} {fltflt_str:<12} {speedup_str:<15}")
+
+    print("=" * 80)
+
+def main():
+    # Find MatX build directory
+    script_dir = Path(__file__).parent
+
+    # Try common build directory locations
+    possible_build_dirs = [
+        script_dir / "build",
+        script_dir / "repos" / "MatX" / "build",
+        script_dir / "../build",
+        script_dir / "../../build",
+    ]
+
+    build_dir = None
+    for bd in possible_build_dirs:
+        if bd.exists():
+            build_dir = bd
+            break
+
+    if build_dir is None:
+        print("Error: Could not find MatX build directory")
+        print("Please run this script from the MatX source directory or specify build path")
+        sys.exit(1)
+
+    print(f"Using build directory: {build_dir}")
+
+    # Find benchmark executable
+    benchmark_exe = find_benchmark_executable(build_dir)
+
+    if benchmark_exe is None:
+        sys.exit(1)
+
+    print(f"Found benchmark: {benchmark_exe}")
+    print()
+
+    # List of benchmarks to run
+    benchmarks = ['add', 'sub', 'mul', 'div', 'sqrt', 'abs', 'fma', 'madd']
+
+    all_results = {}
+
+    # Run each benchmark
+    for bench in benchmarks:
+        bench_name = f"fltflt_bench_{bench}"
+        print(f"\n{'=' * 80}")
+        output = run_benchmark(benchmark_exe, bench_name)
+
+        if output is None:
+            print(f"  Skipping {bench} due to error")
+            continue
+
+        # Parse results
+        results = parse_benchmark_output(output)
+
+        if not results:
+            print(f"  Warning: Could not parse results for {bench}")
+            print("  Raw output:")
+            print(output)
+            continue
+
+        all_results[bench] = results
+        print(f"  Parsed: {', '.join([f'{k}={v:.3f}ms' for k, v in results.items()])}")
+
+    print(f"\n{'=' * 80}")
+
+    if not all_results:
+        print("\nError: No benchmark results collected")
+        sys.exit(1)
+
+    print(f"\nSuccessfully collected results for {len(all_results)} benchmarks")
+
+    # Calculate relative performance
+    relative = calculate_relative_performance(all_results)
+
+    # Print summary
+    print_summary(all_results, relative)
+
+if __name__ == "__main__":
+    main()
diff --git a/include/matx/kernels/fltflt.h b/include/matx/kernels/fltflt.h
index f10e82b9..862ac198 100644
--- a/include/matx/kernels/fltflt.h
+++ b/include/matx/kernels/fltflt.h
@@ -408,6 +408,9 @@ static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_abs(fltflt a)
   return a;
 }
 
+// Scalar abs overload so unary operator dispatch can handle fltflt expressions
+__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt abs(fltflt a) { return fltflt_abs(a); }
+
 __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt operator+(fltflt a, fltflt b) { return fltflt_add(a, b); }
 __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt operator+(fltflt a, float b) { return fltflt_add(a, b); }
 __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt operator+(float a, fltflt b) { return fltflt_add(b, a); }

From d743d276cc5824531169fb532408db1e04c9fad5 Mon Sep 17 00:00:00 2001
From: Thomas Benson
Date: Fri, 30 Jan 2026 06:36:45 -0800
Subject: [PATCH 2/2] Add parsing guards in run_fltflt_benchmarks.py

Signed-off-by: Thomas Benson
---
 bench/scripts/run_fltflt_benchmarks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bench/scripts/run_fltflt_benchmarks.py b/bench/scripts/run_fltflt_benchmarks.py
index a866ada7..e46cefc8 100755
--- a/bench/scripts/run_fltflt_benchmarks.py
+++ b/bench/scripts/run_fltflt_benchmarks.py
@@ -135,7 +135,7 @@ def parse_benchmark_output(output):
             continue
 
         # Skip header and separator lines
-        if 'GPU Time' in line or '---' in line or 'T' in line.split('|')[1]:
+        if 'GPU Time' in line or '---' in line or len(line.split('|')) < 2 or 'T' in line.split('|')[1]:
             continue
 
         cols = [col.strip() for col in line.split('|')]
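
For reference, one way to build and run these benchmarks after applying the
series (the build directory name and cmake invocation below are illustrative
assumptions; the matx_bench path and the --benchmark flag are the ones
run_fltflt_benchmarks.py itself uses):

  cmake -B build -DMATX_BUILD_BENCHMARKS=ON
  cmake --build build
  ./build/bench/matx_bench --benchmark fltflt_bench_add
  python3 bench/scripts/run_fltflt_benchmarks.py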