diff --git a/bench/00_misc/fltflt_arithmetic.cu b/bench/00_misc/fltflt_arithmetic.cu
new file mode 100644
index 00000000..5ce24d2b
--- /dev/null
+++ b/bench/00_misc/fltflt_arithmetic.cu
@@ -0,0 +1,536 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2026, NVIDIA Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+// Benchmarks for fltflt (float-float) arithmetic operations.
+// Compares performance across three precision modes: float, double, and fltflt.
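+//
+// Illustrative note (added for context; not part of the measured kernels): a fltflt value is
+// stored as an unevaluated sum of two floats (hi + lo), so each fltflt add/mul expands into
+// several hardware float operations. A minimal error-free addition sketch in that style,
+// assuming the conventional two-sum construction used by float-float libraries (MatX's actual
+// routines live in include/matx/kernels/fltflt.h; two_sum below is a hypothetical helper name):
+//
+//   __host__ __device__ inline void two_sum(float a, float b, float &s, float &e) {
+//     s = a + b;                      // rounded sum
+//     float bv = s - a;               // portion of b captured in s
+//     e = (a - (s - bv)) + (b - bv);  // exact rounding error of a + b
+//   }
+//
+// This extra work is why fltflt is expected to land between float and double in cost below.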
+
+#include "matx.h"
+#include <nvbench/nvbench.cuh>
+
+using namespace matx;
+
+// Precision types to compare
+using precision_types = nvbench::type_list<float, double, matx::fltflt>;
+
+//==============================================================================
+// Custom kernels that perform operations iteratively to increase arithmetic intensity
+//==============================================================================
+
+// Instruction-level parallelism factor
+static constexpr int ILP_FACTOR = 8;
+// Unroll factor for the inner loop
+static constexpr int ITER_UNROLL_FACTOR = 16;
+
+template <typename T>
+__global__ void iterative_add_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    // Initialize multiple independent accumulators in registers
+    T acc[ILP_FACTOR] = {};
+    const T val = static_cast<T>(0.123456789);
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      // Independent operations for ILP - fully unrollable
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] + val;
+      }
+    }
+
+    // Combine and write to prevent optimization
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_sub_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR] = {};
+    const T val = static_cast<T>(0.001234567);
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] - val;
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_mul_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val = static_cast<T>(1.0000001);
+
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val * val;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 2; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] * val;
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_div_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val = static_cast<T>(1.0000001);
+
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val / val;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 1; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = acc[ilp] / val;
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_sqrt_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T val[ILP_FACTOR];
+    const T init_val = static_cast<T>(2.718281828);
+
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      val[ilp] = sqrt(init_val);
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 1; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        val[ilp] = sqrt(val[ilp]);
+      }
+    }
+
+    T result_val = val[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + val[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_abs_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val = static_cast<T>(-0.123456789);
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        acc[ilp] = -abs(acc[ilp]);
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+__global__ void iterative_fma_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val_a = static_cast<T>(1.001);
+    const T val_b = static_cast<T>(1.002);
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val_b;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        if constexpr (std::is_same_v<T, matx::fltflt>) {
+          acc[ilp] = fltflt_fma(val_a, acc[ilp], val_b);
+        } else {
+          acc[ilp] = val_a * acc[ilp] + val_b;
+        }
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+//==============================================================================
+// Addition Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_add(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  // Create output tensor only
+  auto result = make_tensor<T>({size});
+
+  // Add metrics
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  // Benchmark execution
+  state.exec([&](nvbench::launch &launch) {
+    iterative_add_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_add, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Subtraction Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_sub(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_sub_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_sub, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Multiplication Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_mul(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_mul_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_mul, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Division Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_div(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_div_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_div, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Square Root Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_sqrt(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_sqrt_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_sqrt, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Absolute Value Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_abs(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_abs_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_abs, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Fused Multiply-Add Benchmark
+//==============================================================================
+template <typename T>
+void fltflt_bench_fma(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_fma_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_fma, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
+
+//==============================================================================
+// Multiply-Add (MADD) Benchmark - Separate Multiply and Add Operations
+//==============================================================================
+template <typename T>
+__global__ void iterative_madd_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
+{
+  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < size) {
+    T acc[ILP_FACTOR];
+    const T val_a = static_cast<T>(1.001);
+    const T val_b = static_cast<T>(1.002);
+    #pragma unroll
+    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+      acc[ilp] = val_b;
+    }
+
+    #pragma unroll ITER_UNROLL_FACTOR
+    for (int32_t i = 0; i < iterations; i++) {
+      #pragma unroll
+      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
+        if constexpr (std::is_same_v<T, matx::fltflt>) {
+          // Explicitly separate multiply and add for fltflt
+          acc[ilp] = fltflt_add(fltflt_mul(val_a, acc[ilp]), val_b);
+        } else {
+          // For float/double, use natural expression (may or may not fuse)
+          acc[ilp] = val_a * acc[ilp] + val_b;
+        }
+      }
+    }
+
+    T result_val = acc[0];
+    #pragma unroll
+    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
+      result_val = result_val + acc[ilp];
+    }
+    result[idx] = result_val;
+  }
+}
+
+template <typename T>
+void fltflt_bench_madd(nvbench::state &state, nvbench::type_list<T>)
+{
+  const index_t size = static_cast<index_t>(state.get_int64("Array Size"));
+  const int32_t iterations = static_cast<int32_t>(state.get_int64("Iterations"));
+  cudaExecutor exec{0};
+
+  auto result = make_tensor<T>({size});
+
+  state.add_element_count(size, "NumElements");
+  state.add_global_memory_writes<T>(size);
+
+  constexpr int block_size = 256;
+  int grid_size = static_cast<int>((size + block_size - 1) / block_size);
+
+  exec.sync();
+
+  state.exec([&](nvbench::launch &launch) {
+    iterative_madd_kernel<T><<<grid_size, block_size, 0, launch.get_stream()>>>(
+      result.Data(), size, iterations);
+  });
+}
+
+NVBENCH_BENCH_TYPES(fltflt_bench_madd, NVBENCH_TYPE_AXES(precision_types))
+  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
+  .add_int64_axis("Iterations", {250});
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index 035b7ad4..f4970022 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -1,4 +1,5 @@
 set (bench_sources
+    00_misc/fltflt_arithmetic.cu
     00_transform/matmul.cu
     00_transform/fft.cu
     00_transform/conv.cu
diff --git a/bench/scripts/run_fltflt_benchmarks.py b/bench/scripts/run_fltflt_benchmarks.py
new file mode 100755
index 00000000..e46cefc8
--- /dev/null
+++ b/bench/scripts/run_fltflt_benchmarks.py
@@ -0,0 +1,343 @@
+#!/usr/bin/env python3
+
+# BSD 3-Clause License
+#
+# Copyright (c) 2026, NVIDIA Corporation
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+Run fltflt arithmetic benchmarks and summarize results.
+Shows performance relative to single-precision (1x baseline).
+""" + +import subprocess +import re +import sys +from pathlib import Path +from collections import defaultdict + +def find_benchmark_executable(build_dir): + """Find the matx_bench executable.""" + benchmark_path = build_dir / "bench" / "matx_bench" + + if benchmark_path.exists(): + return benchmark_path + + print(f"Error: Could not find matx_bench at {benchmark_path}") + return None + +def run_benchmark(executable_path, benchmark_name): + """Run a specific benchmark and capture output.""" + print(f"Running benchmark: {benchmark_name}") + + try: + result = subprocess.run( + [str(executable_path), "--benchmark", benchmark_name], + capture_output=True, + text=True, + timeout=300 # 5 minute timeout + ) + + if result.returncode != 0: + print(f" Warning: Benchmark failed with return code {result.returncode}") + print(f" stderr: {result.stderr}") + return None + + return result.stdout + except subprocess.TimeoutExpired: + print(f" Benchmark timed out after 5 minutes") + return None + except Exception as e: + print(f" Error running benchmark: {e}") + return None + +def parse_time_value(time_str): + """Parse time string like '668.707 us' or '6.785 ms' and convert to milliseconds.""" + time_str = time_str.strip() + + # Match number and unit + match = re.match(r'([\d.]+)\s*(us|ms|ns|s)', time_str) + if not match: + return None + + value = float(match.group(1)) + unit = match.group(2) + + # Convert to milliseconds + if unit == 'us': + return value / 1000.0 + elif unit == 'ms': + return value + elif unit == 'ns': + return value / 1_000_000.0 + elif unit == 's': + return value * 1000.0 + else: + return value + +def parse_benchmark_output(output): + """ + Parse the table format output from nvbench. + + Expected format: + | T | Array Size | ... | GPU Time | ... + |--------------|-----------------|-----|------------|----- + | F32 | ... | 668.707 us | ... + | F64 | ... | 47.650 ms | ... + | matx::fltflt | ... | 6.785 ms | ... + """ + results = {} + lines = output.strip().split('\n') + + # Find the header line to locate GPU Time column + gpu_time_col_idx = None + for i, line in enumerate(lines): + if '|' in line and 'GPU Time' in line: + # Split by | and find GPU Time column index + cols = [col.strip() for col in line.split('|')] + for j, col in enumerate(cols): + if 'GPU Time' in col: + gpu_time_col_idx = j + break + break + + if gpu_time_col_idx is None: + print(" Warning: Could not find GPU Time column in output") + return results + + # Parse data rows + for line in lines: + if '|' not in line: + continue + + # Skip header and separator lines + if 'GPU Time' in line or '---' in line or len(line.split('|')) < 2 or 'T' in line.split('|')[1]: + continue + + cols = [col.strip() for col in line.split('|')] + + if len(cols) <= gpu_time_col_idx: + continue + + # Get type column (usually first or second) + type_col = cols[1] if len(cols) > 1 else None + + if not type_col: + continue + + # Map type names + if 'F32' in type_col: + precision = 'float' + elif 'F64' in type_col: + precision = 'double' + elif 'fltflt' in type_col: + precision = 'fltflt' + else: + continue + + # Extract GPU time + gpu_time_str = cols[gpu_time_col_idx] + gpu_time_ms = parse_time_value(gpu_time_str) + + if gpu_time_ms is not None: + results[precision] = gpu_time_ms + + return results + +def calculate_relative_performance(results): + """ + Calculate performance relative to float (single-precision). 
+    float = 1.0x (baseline)
+    Higher values mean slower (took more time relative to float)
+    """
+    relative = {}
+
+    for bench_name, timings in results.items():
+        if 'float' not in timings:
+            print(f"Warning: No float baseline for {bench_name}, skipping")
+            continue
+
+        float_time = timings['float']
+        relative[bench_name] = {}
+
+        for precision, time_value in timings.items():
+            # Relative slowdown: how many times slower than float
+            relative[bench_name][precision] = time_value / float_time
+
+    return relative
+
+def print_summary(results, relative):
+    """Print a formatted summary table."""
+    print("\n")
+    print("=" * 80)
+    print("FLTFLT BENCHMARK SUMMARY")
+    print("=" * 80)
+    print()
+    print("Performance relative to single-precision (float = 1.0x baseline)")
+    print("Higher values indicate slower performance")
+    print()
+
+    # Print header
+    print(f"{'Benchmark':<15} {'float':<12} {'double':<12} {'fltflt':<12} {'fltflt vs dbl':<15}")
+    print("-" * 66)
+
+    # Order benchmarks
+    bench_order = ['add', 'sub', 'mul', 'div', 'sqrt', 'abs', 'fma', 'madd']
+
+    for bench in bench_order:
+        if bench not in relative:
+            continue
+
+        rel = relative[bench]
+        timings = results[bench]
+
+        # Get values with defaults
+        float_rel = rel.get('float', 1.0)
+        double_rel = rel.get('double', None)
+        fltflt_rel = rel.get('fltflt', None)
+
+        # Calculate fltflt speedup vs double (double_time / fltflt_time)
+        fltflt_vs_double = None
+        if 'double' in timings and 'fltflt' in timings:
+            fltflt_vs_double = timings['double'] / timings['fltflt']
+
+        # Format output
+        float_str = f"{float_rel:.2f}x"
+        double_str = f"{double_rel:.2f}x" if double_rel is not None else "N/A"
+        fltflt_str = f"{fltflt_rel:.2f}x" if fltflt_rel is not None else "N/A"
+        speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A"
+
+        print(f"{bench:<15} {float_str:<12} {double_str:<12} {fltflt_str:<12} {speedup_str:<15}")
+
+    print()
+    print("-" * 80)
+    print("Raw timings (milliseconds):")
+    print()
+    print(f"{'Benchmark':<15} {'float':<12} {'double':<12} {'fltflt':<12} {'fltflt vs dbl':<15}")
+    print("-" * 66)
+
+    for bench in bench_order:
+        if bench not in results:
+            continue
+
+        timings = results[bench]
+
+        float_time = timings.get('float', None)
+        double_time = timings.get('double', None)
+        fltflt_time = timings.get('fltflt', None)
+
+        # Calculate fltflt speedup vs double
+        fltflt_vs_double = None
+        if double_time is not None and fltflt_time is not None:
+            fltflt_vs_double = double_time / fltflt_time
+
+        float_str = f"{float_time:.3f}" if float_time is not None else "N/A"
+        double_str = f"{double_time:.3f}" if double_time is not None else "N/A"
+        fltflt_str = f"{fltflt_time:.3f}" if fltflt_time is not None else "N/A"
+        speedup_str = f"{fltflt_vs_double:.2f}x" if fltflt_vs_double is not None else "N/A"
+
+        print(f"{bench:<15} {float_str:<12} {double_str:<12} {fltflt_str:<12} {speedup_str:<15}")
+
+    print("=" * 80)
+
+def main():
+    # Find MatX build directory
+    script_dir = Path(__file__).parent
+
+    # Try common build directory locations
+    possible_build_dirs = [
+        script_dir / "build",
+        script_dir / "repos" / "MatX" / "build",
+        script_dir / "../build",
+        script_dir / "../../build",
+    ]
+
+    build_dir = None
+    for bd in possible_build_dirs:
+        if bd.exists():
+            build_dir = bd
+            break
+
+    if build_dir is None:
+        print("Error: Could not find MatX build directory")
+        print("Please run this script from the MatX source directory or specify build path")
+        sys.exit(1)
+
+    print(f"Using build directory: {build_dir}")
+
+    # Find benchmark executable
+    benchmark_exe = find_benchmark_executable(build_dir)
+
+    if benchmark_exe is None:
+        sys.exit(1)
+
+    print(f"Found benchmark: {benchmark_exe}")
+    print()
+
+    # List of benchmarks to run
+    benchmarks = ['add', 'sub', 'mul', 'div', 'sqrt', 'abs', 'fma', 'madd']
+
+    all_results = {}
+
+    # Run each benchmark
+    for bench in benchmarks:
+        bench_name = f"fltflt_bench_{bench}"
+        print(f"\n{'=' * 80}")
+        output = run_benchmark(benchmark_exe, bench_name)
+
+        if output is None:
+            print(f"  Skipping {bench} due to error")
+            continue
+
+        # Parse results
+        results = parse_benchmark_output(output)
+
+        if not results:
+            print(f"  Warning: Could not parse results for {bench}")
+            print("  Raw output:")
+            print(output)
+            continue
+
+        all_results[bench] = results
+        print(f"  Parsed: {', '.join([f'{k}={v:.3f}ms' for k, v in results.items()])}")
+
+    print(f"\n{'=' * 80}")
+
+    if not all_results:
+        print("\nError: No benchmark results collected")
+        sys.exit(1)
+
+    print(f"\nSuccessfully collected results for {len(all_results)} benchmarks")
+
+    # Calculate relative performance
+    relative = calculate_relative_performance(all_results)
+
+    # Print summary
+    print_summary(all_results, relative)
+
+if __name__ == "__main__":
+    main()
diff --git a/include/matx/kernels/fltflt.h b/include/matx/kernels/fltflt.h
index f10e82b9..862ac198 100644
--- a/include/matx/kernels/fltflt.h
+++ b/include/matx/kernels/fltflt.h
@@ -408,6 +408,9 @@ static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_abs(fltflt a)
   return a;
 }
 
+// Scalar abs overload so unary operator dispatch can handle fltflt expressions
+__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt abs(fltflt a) { return fltflt_abs(a); }
+
 __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt operator+(fltflt a, fltflt b) { return fltflt_add(a, b); }
 __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt operator+(fltflt a, float b) { return fltflt_add(a, b); }
 __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt operator+(float a, fltflt b) { return fltflt_add(b, a); }