diff --git a/src/Metrics.zig b/src/Metrics.zig new file mode 100644 index 0000000..8463445 --- /dev/null +++ b/src/Metrics.zig @@ -0,0 +1,41 @@ +/////////////////////////////////////////////////////////////////////////////// +// Meta + +/// The identifier string for the benchmark +name: []const u8, +/// Total number of measurement samples collected +samples: usize, + +/////////////////////////////////////////////////////////////////////////////// +// Time + +/// Minimum execution time per operation (nanoseconds) +min_ns: f64, +/// Maximum execution time per operation (nanoseconds) +max_ns: f64, +/// Mean execution time (nanoseconds) +mean_ns: f64, +/// Median execution time (nanoseconds) +median_ns: f64, +/// Standard deviation of the execution time +std_dev_ns: f64, + +/////////////////////////////////////////////////////////////////////////////// +// Throughput + +/// Calculated operations per second +ops_sec: f64, +/// Data throughput in MB/s (populated if `bytes_per_op` > 0) +mb_sec: f64, + +/////////////////////////////////////////////////////////////////////////////// +// Hardware (Linux only, null otherwise) + +/// Average CPU cycles per operation +cycles: ?f64 = null, +/// Average CPU instructions executed per operation +instructions: ?f64 = null, +/// Instructions Per Cycle (efficiency ratio) +ipc: ?f64 = null, +/// Average cache misses per operation +cache_misses: ?f64 = null, diff --git a/src/Perf.test.zig b/src/Perf.test.zig deleted file mode 100644 index fe3b1e4..0000000 --- a/src/Perf.test.zig +++ /dev/null @@ -1,51 +0,0 @@ -const std = @import("std"); -const testing = std.testing; -const builtin = @import("builtin"); -const Perf = @import("Perf.zig"); - -test "Perf: lifecycle" { - var perf = Perf.init() catch return error.SkipZigTest; - defer perf.deinit(); - - try perf.capture(); - - var x: u64 = 0; - for (0..10_000) |i| { - x +%= i; - std.mem.doNotOptimizeAway(x); - } - - try perf.stop(); - const m = try perf.read(); - - // Verify we captured instructions - if (m.instructions == 0) { - std.debug.print("WARN: Captured 0 instructions. Check permissions.\n", .{}); - } else { - try testing.expect(m.instructions > 10_000); - try testing.expect(m.cycles > 0); - } -} - -test "Perf: cache misses" { - var perf = Perf.init() catch return error.SkipZigTest; - defer perf.deinit(); - - try perf.capture(); - - // Thrash L1 cache - var buf = try testing.allocator.alloc(u8, 1024 * 1024); - defer testing.allocator.free(buf); - @memset(buf, 0xAA); - - var sum: u64 = 0; - var i: usize = 0; - while (i < buf.len) : (i += 64) { - sum +%= buf[i]; - } - std.mem.doNotOptimizeAway(sum); - - try perf.stop(); - const m = try perf.read(); - std.debug.print("m = {any}", .{m}); -} diff --git a/src/Perf.zig b/src/Perf.zig deleted file mode 100644 index b761fc2..0000000 --- a/src/Perf.zig +++ /dev/null @@ -1,152 +0,0 @@ -// References: https://man7.org/linux/man-pages/man2/perf_event_open.2.html - -const std = @import("std"); -const builtin = @import("builtin"); -const linux = std.os.linux; -const posix = std.posix; - -const Perf = @This(); -const PERF_EVENT_IOC_ID = linux.IOCTL.IOR('$', 7, u64); - -leader_fd: posix.fd_t = -1, -sibling_fds: [2]posix.fd_t = .{ -1, -1 }, - -/// IDs assigned by the kernel to identify events in the read buffer. 
-/// Indices: 0=Cycles, 1=Instructions, 2=CacheMisses -ids: [3]u64 = .{ 0, 0, 0 }, - -pub const Measurements = struct { - cycles: u64, - instructions: u64, - cache_misses: u64, -}; - -pub fn init() !Perf { - var self = Perf{}; - - // CPU Cycles (Group Leader) - self.leader_fd = try openEvent(.cpu_cycles, -1); - self.ids[0] = try getId(self.leader_fd); - - { - const fd = try openEvent(.instructions, self.leader_fd); - self.ids[1] = try getId(fd); - self.sibling_fds[0] = fd; - } - - { - const fd = try openEvent(.cache_misses, self.leader_fd); - self.ids[2] = try getId(fd); - self.sibling_fds[1] = fd; - } - - return self; -} - -pub fn deinit(self: *Perf) void { - if (self.leader_fd != -1) { - _ = linux.close(self.leader_fd); - self.leader_fd = -1; - } - for (self.sibling_fds, 0..) |fd, i| { - if (fd != -1) _ = linux.close(fd); - self.sibling_fds[i] = -1; - } -} - -pub fn capture(self: *Perf) !void { - if (self.leader_fd == -1) return; - const reset = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.RESET, 0); - if (std.c.errno(reset) != .SUCCESS) @panic("ioctl/reset fails"); - const enable = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.ENABLE, 0); - if (std.c.errno(enable) != .SUCCESS) @panic("ioctl/enable fails"); -} - -pub fn stop(self: *Perf) !void { - if (self.leader_fd == -1) return; - const disable = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.DISABLE, 0); - if (std.c.errno(disable) != .SUCCESS) @panic("ioctl/disable fails"); -} - -/// Reads the counter values. -/// Returns a struct with the collected data. -pub fn read(self: *Perf) !Measurements { - var m = Measurements{ - .cycles = 0, - .instructions = 0, - .cache_misses = 0, - }; - if (self.leader_fd == -1) return m; - - // Format: PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_GROUP - // Layout: nr, time_enabled, time_running, [value, id], [value, id], ... - // Max items = 3. Header = 3 u64. 
Total u64s = 3 + (2 * 3) = 9 - var buf: [16]u64 = undefined; - - _ = try posix.read(self.leader_fd, std.mem.sliceAsBytes(&buf)); - - const nr = buf[0]; - const time_enabled = buf[1]; - const time_running = buf[2]; - - // std.debug.print("nr={d}\n", .{nr}); - // std.debug.print("time_running={d}\n", .{time_running}); - - if (time_running == 0) return m; - - var i: usize = 0; - while (i < nr) : (i += 1) { - const base_idx = 3 + (i * 2); - if (base_idx + 1 >= buf.len) break; - - var val = buf[base_idx]; - const id = buf[base_idx + 1]; - - // std.debug.print("i={d} val={d} (before)\n", .{ i, val }); - if (time_running < time_enabled) { - val = @as(u64, @intFromFloat(@as(f64, @floatFromInt(val)) * (@as(f64, @floatFromInt(time_enabled)) / @as(f64, @floatFromInt(time_running))))); - } - - // std.debug.print("i={d} val={d} (after)\n", .{ i, val }); - // std.debug.print("i={d} id={d}\n", .{ i, id }); - - if (id == self.ids[0]) m.cycles = val; - if (id == self.ids[1]) m.instructions = val; - if (id == self.ids[2]) m.cache_misses = val; - } - - return m; -} - -const Event = enum { cpu_cycles, instructions, cache_misses }; - -fn openEvent(event: Event, group_fd: posix.fd_t) !posix.fd_t { - const config: u64 = switch (event) { - .cpu_cycles => @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES), - .instructions => @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS), - .cache_misses => @intFromEnum(linux.PERF.COUNT.HW.CACHE_MISSES), - }; - - var attr = std.mem.zeroes(linux.perf_event_attr); - attr.type = linux.PERF.TYPE.HARDWARE; - attr.config = config; - - // Enable grouping and ID tracking - attr.read_format = 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3; - - attr.flags.disabled = (group_fd == -1); // Only leader starts disabled - attr.flags.inherit = true; - attr.flags.exclude_kernel = true; - attr.flags.exclude_hv = true; - - const fd = try posix.perf_event_open(&attr, 0, -1, group_fd, 0); - return fd; -} - -fn getId(fd: i32) !u64 { - var id: u64 = 0; - if (linux.ioctl(fd, PERF_EVENT_IOC_ID, @intFromPtr(&id)) != 0) { - return error.IoctlFailed; - } - return id; -} diff --git a/src/Reporter.test.zig b/src/Reporter.test.zig new file mode 100644 index 0000000..d40c49c --- /dev/null +++ b/src/Reporter.test.zig @@ -0,0 +1,34 @@ +const std = @import("std"); +const testing = std.testing; + +const Runner = @import("Runner.zig"); +const Reporter = @import("Reporter.zig"); + +fn fibNaive(n: u64) u64 { + if (n <= 1) return n; + return fibNaive(n - 1) + fibNaive(n - 2); +} + +fn fibIterative(n: u64) u64 { + if (n == 0) return 0; + var a: u64 = 0; + var b: u64 = 1; + for (2..n + 1) |_| { + const c = a + b; + a = b; + b = c; + } + return b; +} + +test "report fib" { + const allocator = testing.allocator; + const opts = Runner.Options{ + .sample_size = 100, + .warmup_iters = 3, + }; + const m_naive = try Runner.run(allocator, "fibNaive", fibNaive, .{@as(u64, 20)}, opts); + const m_iter = try Runner.run(allocator, "fibIterative", fibIterative, .{@as(u64, 20)}, opts); + + try Reporter.report(.{ .metrics = &.{ m_naive, m_iter }, .baseline_index = 0 }); +} diff --git a/src/Reporter.zig b/src/Reporter.zig new file mode 100644 index 0000000..118cdf6 --- /dev/null +++ b/src/Reporter.zig @@ -0,0 +1,201 @@ +const std = @import("std"); +const Writer = std.Io.Writer; +const tty = std.Io.tty; + +const Metrics = @import("Metrics.zig"); + +pub const Options = struct { + metrics: []const Metrics, + /// The index in 'metrics' to use as the baseline for comparison (e.g 1.00x). + /// If null, no comparison column is shown. 
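+    /// An index outside 'metrics' is ignored and no comparison is shown.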
+    baseline_index: ?usize = null,
+};
+
+/// Prints a formatted summary table via std.debug.print (stderr).
+pub fn report(options: Options) !void {
+    var buffer: [0x2000]u8 = undefined;
+    var w: Writer = .fixed(&buffer);
+    try writeReport(&w, options);
+    std.debug.print("{s}", .{w.buffered()});
+}
+
+/// Writes the formatted report to the given writer.
+pub fn writeReport(writer: *Writer, options: Options) !void {
+    if (options.metrics.len == 0) return;
+
+    try writer.print("Benchmark Summary: {d} benchmarks run\n", .{options.metrics.len});
+
+    var max_name_len: usize = 0;
+    for (options.metrics) |m| max_name_len = @max(max_name_len, m.name.len);
+
+    for (options.metrics, 0..) |m, i| {
+        const is_last_item = i == options.metrics.len - 1;
+
+        // --- ROW 1: High Level (Name | Time | Speed | Comparison) ---
+        const tree_char = if (is_last_item) "└─ " else "├─ ";
+        try writeColor(writer, .bright_black, tree_char);
+        try writeColor(writer, .cyan, m.name);
+
+        // Align name
+        const padding = max_name_len - m.name.len + 2;
+        _ = try writer.splatByte(' ', padding);
+
+        try fmtTime(writer, m.median_ns);
+        try writer.writeAll(" ");
+
+        if (m.mb_sec > 0.001) {
+            try fmtBandwidth(writer, m.mb_sec);
+        } else {
+            try fmtOps(writer, m.ops_sec);
+        }
+
+        // Comparison column
+        if (options.baseline_index) |base_idx| {
+            try writer.writeAll(" ");
+            if (i == base_idx) {
+                try writeColor(writer, .blue, "[baseline]");
+            } else if (base_idx < options.metrics.len) {
+                const base = options.metrics[base_idx];
+                const base_f = base.median_ns;
+                const curr_f = m.median_ns;
+
+                if (curr_f > 0 and base_f > 0) {
+                    if (curr_f < base_f) {
+                        try writer.writeAll("\x1b[32m"); // raw green escape to mix with print
+                        try writer.print("{d:.2}x faster", .{base_f / curr_f});
+                        try writer.writeAll("\x1b[0m");
+                    } else {
+                        try writer.writeAll("\x1b[31m");
+                        try writer.print("{d:.2}x slower", .{curr_f / base_f});
+                        try writer.writeAll("\x1b[0m");
+                    }
+                } else {
+                    try writer.writeAll("-");
+                }
+            }
+        }
+        try writer.writeByte('\n');
+
+        // Only printed if we have hardware stats
+        if (m.cycles) |cycles| {
+            const sub_tree_prefix = if (is_last_item) " └─ " else "│ └─ ";
+            try writer.writeAll(sub_tree_prefix);
+            try writeColor(writer, .dim, "cycles: ");
+            try fmtInt(writer, cycles);
+        }
+
+        if (m.instructions) |instructions| {
+            try writer.writeAll("\t");
+            try writeColor(writer, .dim, "instructions: ");
+            try fmtInt(writer, instructions);
+        }
+
+        if (m.ipc) |ipc| {
+            try writer.writeAll("\t");
+            try writeColor(writer, .dim, "ipc: ");
+            try writer.print("{d:.2}", .{ipc});
+        }
+
+        if (m.cache_misses) |cache_misses| {
+            try writer.writeAll("\t");
+            try writeColor(writer, .dim, "miss: ");
+            try fmtInt(writer, cache_misses);
+        }
+
+        // Terminate the hardware row if any hardware stat was printed.
+        if (m.cycles != null or m.instructions != null or m.ipc != null or m.cache_misses != null) {
+            try writer.writeByte('\n');
+        }
+    }
+}
+
+fn writeColor(writer: *Writer, color: tty.Color, text: []const u8) !void {
+    const config = tty.Config.detect(std.fs.File.stdout());
+    if (config != .no_color) {
+        switch (color) {
+            .reset => try writer.writeAll("\x1b[0m"),
+            .red => try writer.writeAll("\x1b[31m"),
+            .green => try writer.writeAll("\x1b[32m"),
+            .blue => try writer.writeAll("\x1b[34m"),
+            .cyan => try writer.writeAll("\x1b[36m"),
+            .dim => try writer.writeAll("\x1b[2m"),
+            .bright_black => try writer.writeAll("\x1b[90m"),
+            else => try writer.writeAll(""),
+        }
+    }
+    try writer.writeAll(text);
+    if (config != .no_color) try writer.writeAll("\x1b[0m");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// formatters
+
+fn 
fmtInt(writer: *Writer, val: f64) !void { + if (val < 1000) { + try writer.print("{d:.0}", .{val}); + } else if (val < 1_000_000) { + try writer.print("{d:.1}k", .{val / 1000.0}); + } else if (val < 1_000_000_000) { + try writer.print("{d:.1}M", .{val / 1_000_000.0}); + } else { + try writer.print("{d:.1}G", .{val / 1_000_000_000.0}); + } +} + +fn fmtTime(writer: *Writer, ns: f64) !void { + var buf: [64]u8 = undefined; + var slice: []u8 = undefined; + + if (ns < 1000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}ns", .{ns}); + } else if (ns < 1_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}us", .{ns / 1000.0}); + } else if (ns < 1_000_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}ms", .{ns / 1_000_000.0}); + } else { + slice = try std.fmt.bufPrint(&buf, "{d:.2}s", .{ns / 1_000_000_000.0}); + } + try padLeft(writer, slice, 9); +} + +fn fmtOps(writer: *Writer, ops: f64) !void { + var buf: [64]u8 = undefined; + var slice: []u8 = undefined; + + if (ops < 1000) { + slice = try std.fmt.bufPrint(&buf, "{d:.0}/s", .{ops}); + } else if (ops < 1_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}K/s", .{ops / 1000.0}); + } else if (ops < 1_000_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}M/s", .{ops / 1_000_000.0}); + } else { + slice = try std.fmt.bufPrint(&buf, "{d:.2}G/s", .{ops / 1_000_000_000.0}); + } + try padLeft(writer, slice, 11); +} + +fn fmtBandwidth(writer: *Writer, mb: f64) !void { + var buf: [64]u8 = undefined; + var slice: []u8 = undefined; + + if (mb >= 1000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}GB/s", .{mb / 1000.0}); + } else { + slice = try std.fmt.bufPrint(&buf, "{d:.2}MB/s", .{mb}); + } + try padLeft(writer, slice, 11); +} + +// Pads with spaces on the left (for numbers) +fn padLeft(writer: *Writer, text: []const u8, width: usize) !void { + if (text.len < width) { + _ = try writer.splatByte(' ', width - text.len); + } + try writer.writeAll(text); +} + +// Pads with spaces on the right (for text/comparisons) +fn padRight(writer: *Writer, text: []const u8, width: usize) !void { + try writer.writeAll(text); + if (text.len < width) { + _ = try writer.splatByte(' ', width - text.len); + } +} diff --git a/src/test.zig b/src/Runner.test.zig similarity index 65% rename from src/test.zig rename to src/Runner.test.zig index c9462a4..76f0d51 100644 --- a/src/test.zig +++ b/src/Runner.test.zig @@ -2,7 +2,7 @@ const builtin = @import("builtin"); const std = @import("std"); const testing = std.testing; -const bench = @import("root.zig"); +const Runner = @import("Runner.zig"); fn noOp() !void { // Pure overhead measurement @@ -25,18 +25,9 @@ fn sleepWork() !void { std.mem.doNotOptimizeAway(io); } -// Global buffer for memory test -var src_buf: [16 * 1024]u8 = undefined; -var dst_buf: [16 * 1024]u8 = undefined; - -fn copyWork() !void { - @memcpy(&dst_buf, &src_buf); - std.mem.doNotOptimizeAway(dst_buf); -} - -test "run: basic check" { +test "basic metrics" { const allocator = testing.allocator; - const noop_metrics = try bench.run(allocator, "NoOp", noOp, .{}, .{}); + const noop_metrics = try Runner.run(allocator, "NoOp", noOp, .{}, .{}); // The minimum cannot be larger than the maximum try testing.expect(noop_metrics.min_ns <= noop_metrics.max_ns); @@ -48,7 +39,7 @@ test "run: basic check" { // Execution must take some time (non-zero) try testing.expect(noop_metrics.min_ns > 0); - const busy_metrics = try bench.run(allocator, "Busy", busyWork, .{}, .{}); + const busy_metrics = try Runner.run(allocator, "Busy", busyWork, .{}, .{}); // The busy function 
MUST be slower than the no-op try testing.expect(busy_metrics.median_ns > noop_metrics.median_ns); @@ -58,62 +49,17 @@ test "run: basic check" { // not just the overhead of the tool itself. try testing.expect(busy_metrics.median_ns > (noop_metrics.median_ns * 2)); - const sleep_metrics = try bench.run(allocator, "Sleep", sleepWork, .{}, .{}); + const sleep_metrics = try Runner.run(allocator, "Sleep", sleepWork, .{}, .{}); const target_ns = 1 * std.time.ns_per_ms; // We check if the result is reasonably close to 1ms. // Note: OS Sleep is imprecise. It will always be >= target, never less. - // We allow a "scheduler noise" overhead (e.g., +2ms tolerance for CI environments). try testing.expect(sleep_metrics.median_ns >= target_ns); const tolerance = 2 * std.time.ns_per_ms; try testing.expect(sleep_metrics.median_ns < (target_ns + tolerance)); } -test "run: bandwidth check" { - const allocator = testing.allocator; - @memset(&src_buf, 0xAA); - const metrics = try bench.run(allocator, "Copy", copyWork, .{}, .{ - .sample_size = 1000, - .bytes_per_op = src_buf.len, - }); - - try testing.expect(metrics.mb_sec > 0); - try testing.expect(metrics.mb_sec > 1.0); // Sanity check -} - -test "report: output" { - const allocator = testing.allocator; - @memset(&src_buf, 0xAA); - const copy_metrics = try bench.run(allocator, "Copy", copyWork, .{}, .{ - .sample_size = 1000, - .bytes_per_op = src_buf.len, - }); - - const noop_metrics = try bench.run(allocator, "NoOp", noOp, .{}, .{}); - const sleep_metrics = try bench.run(allocator, "Sleep", sleepWork, .{}, .{}); - const busy_metrics = try bench.run(allocator, "Busy", busyWork, .{}, .{}); - - var single: std.Io.Writer.Allocating = .init(allocator); - defer single.deinit(); - try bench.writeReport(&single.writer, .{ .metrics = &.{copy_metrics} }); - - var double: std.Io.Writer.Allocating = .init(allocator); - defer double.deinit(); - try bench.writeReport(&double.writer, .{ .metrics = &.{ noop_metrics, sleep_metrics } }); - - var baseline: std.Io.Writer.Allocating = .init(allocator); - defer baseline.deinit(); - try bench.writeReport(&baseline.writer, .{ - .metrics = &.{ noop_metrics, sleep_metrics, busy_metrics }, - .baseline_index = 0, - }); - - std.debug.print("\nsingle:\n{s}\n", .{single.written()}); - std.debug.print("\ndouble:\n{s}\n", .{double.written()}); - std.debug.print("\nbaseline:\n{s}\n", .{baseline.written()}); -} - // Simulate a whitespace skipper function fn skipWhitespaceNaive(input: []const u8) !void { var i: usize = 0; @@ -132,7 +78,7 @@ fn skipWhitespaceSIMD(input: []const u8) !void { std.mem.doNotOptimizeAway(i); } -test "run: with args" { +test "run with args" { const allocator = testing.allocator; // Generate test data outside the benchmark @@ -142,8 +88,8 @@ test "run: with args" { @memset(input, ' '); input[len - 1] = 'x'; // Stop at the end - const m_naive = try bench.run(allocator, "Naive", skipWhitespaceNaive, .{input}, .{ .sample_size = 100 }); - const m_simd = try bench.run(allocator, "SIMD", skipWhitespaceSIMD, .{input}, .{ .sample_size = 100 }); + const m_naive = try Runner.run(allocator, "Naive", skipWhitespaceNaive, .{input}, .{ .sample_size = 100 }); + const m_simd = try Runner.run(allocator, "SIMD", skipWhitespaceSIMD, .{input}, .{ .sample_size = 100 }); try testing.expect(m_naive.median_ns > 0); try testing.expect(m_simd.median_ns > 0); @@ -152,20 +98,38 @@ test "run: with args" { try testing.expect(m_simd.median_ns < m_naive.median_ns); } -/////////////////////////////////////////////////////////////////////////////// -// 
accuracy test
+// Global buffer for memory test
+var src_buf: [16 * 1024]u8 = undefined;
+var dst_buf: [16 * 1024]u8 = undefined;
+
+fn copyWork() !void {
+    @memcpy(&dst_buf, &src_buf);
+    std.mem.doNotOptimizeAway(dst_buf);
+}
+
+test "bandwidth check" {
+    const allocator = testing.allocator;
+    @memset(&src_buf, 0xAA);
+    const metrics = try Runner.run(allocator, "Copy", copyWork, .{}, .{
+        .sample_size = 1000,
+        .bytes_per_op = src_buf.len,
+    });
+
+    try testing.expect(metrics.mb_sec > 0);
+    try testing.expect(metrics.mb_sec > 1.0); // Sanity check
+}
 
 fn fastIncrement(val: *u64) !void {
     val.* +%= 1;
     std.mem.doNotOptimizeAway(val.*);
 }
 
-test "accuracy: adaptive batching precision" {
+test "metrics accuracy" {
     const allocator = testing.allocator;
     var x: u64 = 0;
 
     // Run the benchmark on a sub-nanosecond operation
-    const metrics = try bench.run(allocator, "FastInc", fastIncrement, .{&x}, .{
+    const metrics = try Runner.run(allocator, "FastIncrement", fastIncrement, .{&x}, .{
         .warmup_iters = 100,
         .sample_size = 1000,
     });
@@ -210,10 +174,10 @@ fn functionReturnValueError() !u64 {
-test "run: suppported signatures" {
+test "run: supported signatures" {
     const allocator = testing.allocator;
 
-    _ = try bench.run(allocator, "functionReturnVoid", functionReturnVoid, .{}, .{});
-    _ = try bench.run(allocator, "functionReturnVoidError", functionReturnVoidError, .{}, .{});
-    _ = try bench.run(allocator, "functionReturnValue", functionReturnValue, .{}, .{});
-    _ = try bench.run(allocator, "functionReturnValueError", functionReturnValueError, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnVoid", functionReturnVoid, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnVoidError", functionReturnVoidError, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnValue", functionReturnValue, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnValueError", functionReturnValueError, .{}, .{});
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -239,13 +203,13 @@ fn fibIterative(n: u64) u64 {
 }
 
 test "run: fibonacci" {
-    const allocator = std.heap.smp_allocator;
-    const opts = bench.Options{
+    const allocator = testing.allocator;
+    const opts = Runner.Options{
         .sample_size = 100,
         .warmup_iters = 3,
     };
-    const m_naive = try bench.run(allocator, "fibNaive", fibNaive, .{@as(u64, 30)}, opts);
-    const m_iter = try bench.run(allocator, "fibIterative", fibIterative, .{@as(u64, 30)}, opts);
+    const m_naive = try Runner.run(allocator, "fibNaive", fibNaive, .{@as(u64, 30)}, opts);
+    const m_iter = try Runner.run(allocator, "fibIterative", fibIterative, .{@as(u64, 30)}, opts);
 
     try testing.expect(m_naive.mean_ns > m_iter.mean_ns * 100);
 }
diff --git a/src/Runner.zig b/src/Runner.zig
new file mode 100644
index 0000000..f770756
--- /dev/null
+++ b/src/Runner.zig
@@ -0,0 +1,223 @@
+const builtin = @import("builtin");
+const std = @import("std");
+const math = std.math;
+const sort = std.sort;
+const Timer = std.time.Timer;
+const Allocator = std.mem.Allocator;
+
+const Metrics = @import("Metrics.zig");
+const perf = @import("perf.zig");
+
+pub const Options = struct {
+    warmup_iters: u64 = 100,
+    sample_size: u64 = 1000,
+    bytes_per_op: usize = 0,
+};
+
+pub fn run(allocator: Allocator, name: []const u8, function: anytype, args: anytype, options: Options) !Metrics {
+    assertFunctionDef(function, args);
+
+    // ref: https://pyk.sh/blog/2025-12-08-bench-fixing-constant-folding
+    var runtime_args = createRuntimeArgs(function, args);
+    std.mem.doNotOptimizeAway(&runtime_args);
+
+    for (0..options.warmup_iters) |_| 
{ + try execute(function, runtime_args); + } + + // We need to determine a batch_size such that the total execution time of the batch + // is large enough to minimize timer resolution noise. + // Target: 1ms (1,000,000 ns) per measurement block. + const min_sample_time_ns = 1_000_000; + var batch_size: u64 = 1; + var timer = try Timer.start(); + + while (true) { + timer.reset(); + for (0..batch_size) |_| { + try execute(function, runtime_args); + } + const duration = timer.read(); + + if (duration >= min_sample_time_ns) break; + + // If the duration is 0 (too fast to measure) or small, scale up + if (duration == 0) { + batch_size *= 10; + } else { + const ratio = @as(f64, @floatFromInt(min_sample_time_ns)) / @as(f64, @floatFromInt(duration)); + const multiplier = @as(u64, @intFromFloat(std.math.ceil(ratio))); + if (multiplier <= 1) { + batch_size *= 2; // Fallback growth + } else { + batch_size *= multiplier; + } + } + } + + const samples = try allocator.alloc(f64, options.sample_size); + defer allocator.free(samples); + + for (0..options.sample_size) |i| { + timer.reset(); + for (0..batch_size) |_| { + try execute(function, runtime_args); + } + const total_ns = timer.read(); + // Average time per operation for this batch + samples[i] = @as(f64, @floatFromInt(total_ns)) / @as(f64, @floatFromInt(batch_size)); + } + + // Sort samples to find the median and process min/max + sort.block(f64, samples, {}, sort.asc(f64)); + + var sum: f64 = 0; + for (samples) |s| sum += s; + + const mean = sum / @as(f64, @floatFromInt(options.sample_size)); + + // Calculate Variance for Standard Deviation + var sum_sq_diff: f64 = 0; + for (samples) |s| { + const diff = s - mean; + sum_sq_diff += diff * diff; + } + const variance = sum_sq_diff / @as(f64, @floatFromInt(options.sample_size)); + + // Calculate Operations Per Second + const ops_sec = if (mean > 0) 1_000_000_000.0 / mean else 0; + + // Calculate MB/s (Megabytes per second) + // Formula: (Ops/Sec * Bytes/Op) / 1,000,000 + const mb_sec = if (options.bytes_per_op > 0) + (ops_sec * @as(f64, @floatFromInt(options.bytes_per_op))) / 1_000_000.0 + else + 0; + + var metrics = Metrics{ + .name = name, + .min_ns = samples[0], + .max_ns = samples[samples.len - 1], + .mean_ns = mean, + .median_ns = samples[options.sample_size / 2], + .std_dev_ns = math.sqrt(variance), + .samples = options.sample_size, + .ops_sec = ops_sec, + .mb_sec = mb_sec, + }; + + if (builtin.os.tag == .linux) { + const events = [_]perf.Event{ .cpu_cycles, .instructions, .cache_misses }; + const perf_group = perf.Group(&events); + if (perf_group.init()) |pg| { + var group = pg; + defer group.deinit(); + + try group.enable(); + for (0..options.sample_size) |_| { + for (0..batch_size) |_| { + try execute(function, runtime_args); + } + } + try group.disable(); + + const m = try group.read(); + const total_ops = @as(f64, @floatFromInt(options.sample_size * batch_size)); + const avg_cycles = @as(f64, @floatFromInt(m.cpu_cycles)) / total_ops; + const avg_instr = @as(f64, @floatFromInt(m.instructions)) / total_ops; + const avg_misses = @as(f64, @floatFromInt(m.cache_misses)) / total_ops; + + metrics.cycles = avg_cycles; + metrics.instructions = avg_instr; + metrics.cache_misses = avg_misses; + if (avg_cycles > 0) { + metrics.ipc = avg_instr / avg_cycles; + } + } else |_| {} // skip counter if we can't open it + } + + return metrics; +} + +inline fn execute(function: anytype, args: anytype) !void { + const FnType = unwrapFnType(@TypeOf(function)); + const return_type = 
@typeInfo(FnType).@"fn".return_type.?; + + // Conditional execution based on whether the function can fail + if (@typeInfo(return_type) == .error_union) { + const result = try @call(.auto, function, args); + std.mem.doNotOptimizeAway(result); + } else { + const result = @call(.auto, function, args); + std.mem.doNotOptimizeAway(result); + } +} + +/// Returns the underlying Function type, unwrapping it if it is a pointer. +fn unwrapFnType(comptime T: type) type { + if (@typeInfo(T) == .pointer) return @typeInfo(T).pointer.child; + return T; +} + +//////////////////////////////////////////////////////////////////////////////// +// Function definition checker + +fn assertFunctionDef(function: anytype, args: anytype) void { + const ArgsType = @TypeOf(args); + const args_info = @typeInfo(ArgsType); + if (args_info != .@"struct" or !args_info.@"struct".is_tuple) { + @compileError("Expected 'args' to be a tuple, found '" ++ @typeName(ArgsType) ++ "'"); + } + + const FnType = unwrapFnType(@TypeOf(function)); + if (@typeInfo(FnType) != .@"fn") { + @compileError("Expected 'function' to be a function or function pointer, found '" ++ @typeName(@TypeOf(function)) ++ "'"); + } + + const params_len = @typeInfo(FnType).@"fn".params.len; + const args_len = @typeInfo(ArgsType).@"struct".fields.len; + + if (params_len != args_len) { + @compileError(std.fmt.comptimePrint( + "Function expects {d} arguments, but args tuple has {d}", + .{ params_len, args_len }, + )); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Runtime Arguments Helpers + +/// Constructs the runtime argument tuple based on function parameters and input args. +fn createRuntimeArgs(function: anytype, args: anytype) RuntimeArgsType(@TypeOf(function), @TypeOf(args)) { + const TupleType = RuntimeArgsType(@TypeOf(function), @TypeOf(args)); + var runtime_args: TupleType = undefined; + + // We only need the length here to iterate + const fn_params = getFnParams(@TypeOf(function)); + + inline for (0..fn_params.len) |i| { + runtime_args[i] = args[i]; + } + return runtime_args; +} + +/// Computes the precise Tuple type required to hold the arguments. +fn RuntimeArgsType(comptime FnType: type, comptime ArgsType: type) type { + const fn_params = getFnParams(FnType); + const args_fields = @typeInfo(ArgsType).@"struct".fields; + comptime var types: [fn_params.len]type = undefined; + inline for (fn_params, 0..) 
|p, i| {
+        if (p.type) |t| {
+            types[i] = t;
+        } else {
+            types[i] = args_fields[i].type;
+        }
+    }
+    return std.meta.Tuple(&types);
+}
+
+/// Helper to unwrap function pointers and retrieve parameter info
+fn getFnParams(comptime FnType: type) []const std.builtin.Type.Fn.Param {
+    return @typeInfo(unwrapFnType(FnType)).@"fn".params;
+}
diff --git a/src/perf.test.zig b/src/perf.test.zig
new file mode 100644
index 0000000..9df8232
--- /dev/null
+++ b/src/perf.test.zig
@@ -0,0 +1,98 @@
+const std = @import("std");
+const testing = std.testing;
+const linux = std.os.linux;
+
+const perf = @import("perf.zig");
+
+test "Event toConfig mapping" {
+    try testing.expectEqual(
+        perf.Event.cpu_cycles.toConfig(),
+        @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES),
+    );
+    try testing.expectEqual(
+        perf.Event.instructions.toConfig(),
+        @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS),
+    );
+    try testing.expectEqual(
+        perf.Event.branch_misses.toConfig(),
+        @intFromEnum(linux.PERF.COUNT.HW.BRANCH_MISSES),
+    );
+}
+
+test "GroupReadOutputType generates correct struct fields" {
+    const events = [_]perf.Event{ .cpu_cycles, .branch_misses };
+    const MyCounters = perf.GroupReadOutputType(&events);
+
+    // We expect the struct to have fields named after the events
+    try testing.expect(@hasField(MyCounters, "cpu_cycles"));
+    try testing.expect(@hasField(MyCounters, "branch_misses"));
+
+    // We expect the struct NOT to have fields we didn't include
+    try testing.expect(!@hasField(MyCounters, "instructions"));
+
+    const info = @typeInfo(MyCounters);
+    inline for (info.@"struct".fields) |field| {
+        try testing.expect(field.type == u64);
+    }
+}
+
+test "GroupReadOutputType instantiation and usage" {
+    const events = [_]perf.Event{ .instructions, .cache_misses };
+    const MyCounters = perf.GroupReadOutputType(&events);
+
+    var counters = MyCounters{
+        .instructions = 100,
+        .cache_misses = 5,
+    };
+
+    counters.instructions += 50;
+    try testing.expectEqual(150, counters.instructions);
+    try testing.expectEqual(5, counters.cache_misses);
+}
+
+test "Sanity check" {
+    const ValidGroup = perf.Group(&.{.cpu_cycles});
+    try testing.expect(@sizeOf(ValidGroup) > 0);
+}
+
+test "Group init/deinit lifecycle" {
+    const MyGroup = perf.Group(&.{ .cpu_cycles, .instructions });
+
+    // init() may fail with OpenGroupFailed (EACCES/ENOENT) on many CI
+    // systems, so when the counters cannot be opened we skip the test
+    // instead of failing it.
+ var group = MyGroup.init() catch return error.SkipZigTest; + try testing.expect(group.event_fds[0] != -1); + try testing.expect(group.event_ids[0] != 0); + group.deinit(); + try testing.expect(group.event_fds[0] == -1); + try testing.expect(group.event_ids[0] == 0); +} + +test "Group handles BadGroup error" { + const MyGroup = perf.Group(&.{.cpu_cycles}); + var group = MyGroup.init() catch return error.SkipZigTest; + group.deinit(); + try testing.expectError(error.BadGroup, group.enable()); + try testing.expectError(error.BadGroup, group.disable()); +} + +test "Group lifecycle" { + const MyGroup = perf.Group(&.{ .instructions, .cpu_cycles }); + var group = MyGroup.init() catch return error.SkipZigTest; + defer group.deinit(); + + try group.enable(); + + var x: u64 = 0; + for (0..10_000) |i| { + x +%= i; + std.mem.doNotOptimizeAway(x); + } + + try group.disable(); + const m = try group.read(); + + try testing.expect(m.instructions > 10_000); + try testing.expect(m.cpu_cycles > 0); +} diff --git a/src/perf.zig b/src/perf.zig new file mode 100644 index 0000000..d032388 --- /dev/null +++ b/src/perf.zig @@ -0,0 +1,288 @@ +const std = @import("std"); +const linux = std.os.linux; +const Type = std.builtin.Type; + +// Bits for perf_event_attr.read_format +const PERF_FORMAT_TOTAL_TIME_ENABLED = 1 << 0; +const PERF_FORMAT_TOTAL_TIME_RUNNING = 1 << 1; +const PERF_FORMAT_ID = 1 << 2; +const PERF_FORMAT_GROUP = 1 << 3; + +// Various ioctls act on perf_event_open() file descriptors: +const PERF_EVENT_IOC_ID = linux.IOCTL.IOR('$', 7, u64); +const PERF_EVENT_IOC_RESET = linux.PERF.EVENT_IOC.RESET; +const PERF_EVENT_IOC_ENABLE = linux.PERF.EVENT_IOC.ENABLE; +const PERF_EVENT_IOC_DISABLE = linux.PERF.EVENT_IOC.DISABLE; + +/// The hardware events supported by the kernel for performance monitoring. +/// These map directly to `perf_event_attr.config` values. +pub const Event = enum { + cpu_cycles, + instructions, + cache_misses, + branch_misses, + bus_cycles, + + /// Converts the enum into the specific kernel configuration integer + /// required by the `perf_event_open` syscall. + pub fn toConfig(self: Event) u64 { + return switch (self) { + .cpu_cycles => @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES), + .instructions => @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS), + .cache_misses => @intFromEnum(linux.PERF.COUNT.HW.CACHE_MISSES), + .branch_misses => @intFromEnum(linux.PERF.COUNT.HW.BRANCH_MISSES), + .bus_cycles => @intFromEnum(linux.PERF.COUNT.HW.BUS_CYCLES), + }; + } +}; + +pub fn GroupReadOutputType(comptime events: []const Event) type { + var field_names: [events.len][]const u8 = undefined; + var field_types: [events.len]type = undefined; + var field_attrs: [events.len]Type.StructField.Attributes = undefined; + for (events, 0..) |event, index| { + field_names[index] = @tagName(event); + field_types[index] = u64; + field_attrs[index] = .{ + .@"comptime" = false, + .@"align" = @alignOf(u64), + .default_value_ptr = null, + }; + } + return @Struct( + .auto, + null, + &field_names, + &field_types, + &field_attrs, + ); +} + +/// A type-safe wrapper for the Linux `perf_event_open` system call, +/// specifically configured for event grouping (`PERF_FORMAT_GROUP`). +/// +/// `Group` leverages Zig's `comptime` features to generate a custom +/// `ReadOutputType` result type that strictly matches the requested `events`. +/// It manages the complexity of creating a group leader, attaching sibling +/// events, and handling the binary layout of the kernel's read buffer. 
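+///
+/// A minimal usage sketch (this mirrors the lifecycle test in
+/// perf.test.zig; `init`, `enable`, `disable`, and `read` can all fail,
+/// so real callers handle the errors):
+///
+///     const G = Group(&.{ .cpu_cycles, .instructions });
+///     var group = try G.init();
+///     defer group.deinit();
+///
+///     try group.enable();
+///     // ... code under measurement ...
+///     try group.disable();
+///
+///     const counts = try group.read();
+///     // counts.cpu_cycles and counts.instructions hold the (scaled) u64 totals.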
+/// +/// Notes: +/// * The `read()` method returns a struct with named fields corresponding +/// exactly to the input events (e.g. `.cpu_cycles`). +/// * The `read()` method automatically detects if the CPU was oversubscribed +/// and scales the counter values based on `time_enabled` and `time_running`. +/// +/// References: +/// * man 2 perf_event_open +/// * man 1 perf-list +pub fn Group(comptime events: []const Event) type { + if (events.len == 0) @compileError("perf.Group requires at least 1 event"); + + const Error = error{ + /// Failed to open group via perf_event_open + OpenGroupFailed, + /// Failed to retrieve the ID of the event via IOCTL + GetIdFailed, + /// Failed to reset counters via IOCTL + ResetGroupFailed, + /// Failed to enable counters via IOCTL + EnableGroupFailed, + /// Failed to disable counters via IOCTL + DisableGroupFailed, + /// Failed to read data from the file descriptor + ReadGroupFailed, + /// Group already deinitialized + BadGroup, + }; + + const Output = GroupReadOutputType(events); + + // Matches the binary layout of the buffer read from the group leader fd. + // See `man perf_event_open` section "Reading results". + // Corresponds to `struct read_format` when using: + // PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | + // PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID + const ReadFormatGroup = extern struct { + /// The number of events in this group. + nr: u64, + /// Total time the event group was enabled. + time_enabled: u64, + /// Total time the event group was actually running. + time_running: u64, + /// Array of values matching the `nr` of events. + values: [events.len]extern struct { + value: u64, + id: u64, + }, + }; + + return struct { + const Self = @This(); + + event_fds: [events.len]linux.fd_t = undefined, + event_ids: [events.len]u64 = undefined, + + /// Initializes the performance monitoring group. + /// + /// This opens a file descriptor for every event in the `events` list. + /// The first event becomes the group leader. All subsequent events + /// are created as siblings pinned to the leader. + /// + /// The counters start in a disabled state. You must call `enable()` + /// to begin counting. + /// + /// **Note:** The caller owns the returned group and must call `deinit` + /// to close the file descriptors. + pub fn init() Error!Self { + var self = Self{}; + @memset(&self.event_fds, -1); + + // Leader + var group_fd = @as(i32, -1); + const event_config = events[0].toConfig(); + self.event_fds[0] = try perf_open_group(group_fd, event_config); + self.event_ids[0] = try ioctl_get_id(self.event_fds[0]); + group_fd = self.event_fds[0]; + + // Siblings + if (events.len > 1) { + for (events[1..], 1..) |event, i| { + const config = event.toConfig(); + self.event_fds[i] = try perf_open_group(group_fd, config); + self.event_ids[i] = try ioctl_get_id(self.event_fds[i]); + } + } + return self; + } + + /// Closes all file descriptors associated with this event group. + /// This invalidates the group object. + pub fn deinit(self: *Self) void { + for (self.event_fds, 0..) |event_fd, index| { + if (event_fd != -1) { + _ = linux.close(event_fd); + } + self.event_fds[index] = -1; + self.event_ids[index] = 0; + } + } + + /// Resets and enables the event group. Counting begins immediately. + pub fn enable(self: *Self) Error!void { + const group_fd = self.event_fds[0]; + if (group_fd == -1) return error.BadGroup; + try ioctl_reset_group(group_fd); + try ioctl_enable_group(group_fd); + } + + /// Disables the event group. Counting stops immediately. 
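+        /// Returns `error.BadGroup` if the group was already deinitialized.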
+ pub fn disable(self: *Self) Error!void { + const group_fd = self.event_fds[0]; + if (group_fd == -1) return error.BadGroup; + try ioctl_disable_group(group_fd); + } + + /// Reads the current values from the kernel and maps them to the + /// type-safe output struct. + /// + /// This performs the following operations: + /// 1. Reads the `read_format` binary struct from the leader FD. + /// 2. Checks `time_enabled` and `time_running` to detect if the CPU + /// was oversubscribed. + /// 3. If multiplexing occurred (time_running < time_enabled), scales + /// the raw values: `val = raw_val * (time_enabled / time_running)` + /// 4. Maps the kernel's event IDs back to the field names of the output + /// struct. + pub fn read(self: *Self) Error!Output { + var output: Output = std.mem.zeroes(Output); + var data: ReadFormatGroup = undefined; + + const rc = linux.read(self.event_fds[0], @ptrCast(&data), @sizeOf(ReadFormatGroup)); + if (linux.errno(rc) != .SUCCESS) return error.ReadGroupFailed; + + // If time_running is 0, we can't scale, so return zeros. + if (data.time_running == 0) return output; + + // Multiplexing scaling: scaled_value = value * (time_enabled / time_running) + const scale_needed = data.time_running < data.time_enabled; + const scale_factor = if (scale_needed) + @as(f64, @floatFromInt(data.time_enabled)) / @as(f64, @floatFromInt(data.time_running)) + else + 1.0; + + for (data.values) |item| { + var val = item.value; + + if (scale_needed) { + val = @as(u64, @intFromFloat(@as(f64, @floatFromInt(val)) * scale_factor)); + } + + // Map the kernel ID back to our event tags + inline for (events, 0..) |tag, i| { + if (item.id == self.event_ids[i]) { + @field(output, @tagName(tag)) = val; + } + } + } + + return output; + } + + /////////////////////////////////////////////////////////////////////////////// + // perf & ioctl calls + + // Open new file descriptor for the specific event + fn perf_open_group(group_fd: linux.fd_t, config: u64) Error!linux.fd_t { + var attr = std.mem.zeroes(linux.perf_event_attr); + attr.type = linux.PERF.TYPE.HARDWARE; + attr.config = config; + + // Enable grouping and ID tracking + attr.read_format = PERF_FORMAT_GROUP | + PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING | + PERF_FORMAT_ID; + + attr.flags.disabled = (group_fd == -1); // Only leader starts disabled + attr.flags.inherit = true; + attr.flags.exclude_kernel = true; + attr.flags.exclude_hv = true; + + // ref: `man 2 perf_event_open` + // pid=0 (current process), cpu=-1 (any cpu), flags=0 + const pid = 0; + const cpu = -1; + const flags = 0; + + const rc = linux.perf_event_open(&attr, pid, cpu, group_fd, flags); + if (linux.errno(rc) != .SUCCESS) return error.OpenGroupFailed; + return @intCast(rc); + } + + // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_ID` + fn ioctl_get_id(fd: linux.fd_t) Error!u64 { + var id: u64 = 0; + const rc = linux.ioctl(fd, PERF_EVENT_IOC_ID, @intFromPtr(&id)); + if (linux.errno(rc) != .SUCCESS) return error.GetIdFailed; + return id; + } + + // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_RESET` + fn ioctl_reset_group(fd: linux.fd_t) Error!void { + const rc = linux.ioctl(fd, PERF_EVENT_IOC_RESET, 0); + if (linux.errno(rc) != .SUCCESS) return error.ResetGroupFailed; + } + + // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_ENABLE` + fn ioctl_enable_group(fd: linux.fd_t) Error!void { + const rc = linux.ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + if (linux.errno(rc) != .SUCCESS) return error.EnableGroupFailed; + } + + // 
ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_DISABLE` + fn ioctl_disable_group(fd: linux.fd_t) Error!void { + const rc = linux.ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + if (linux.errno(rc) != .SUCCESS) return error.DisableGroupFailed; + } + }; +} diff --git a/src/root.zig b/src/root.zig index 72ae0b8..04e530e 100644 --- a/src/root.zig +++ b/src/root.zig @@ -1,452 +1,18 @@ const builtin = @import("builtin"); -const std = @import("std"); -const math = std.math; -const sort = std.sort; -const Timer = std.time.Timer; -const Allocator = std.mem.Allocator; -const Writer = std.Io.Writer; -const tty = std.Io.tty; -const Perf = @import("Perf.zig"); +pub const Metrics = @import("Metrics.zig"); +pub const perf = @import("perf.zig"); +pub const Runner = @import("Runner.zig"); +pub const Reporter = @import("Reporter.zig"); -/// Metrics of the execution -pub const Metrics = struct { - name: []const u8, - // Time - min_ns: f64, - max_ns: f64, - mean_ns: f64, - median_ns: f64, - std_dev_ns: f64, - // Throughput - samples: usize, - ops_sec: f64, - mb_sec: f64, - // Hardware (Linux only, null otherwise) - cycles: ?f64 = null, - instructions: ?f64 = null, - ipc: ?f64 = null, - cache_misses: ?f64 = null, -}; - -pub const Options = struct { - warmup_iters: u64 = 100, - sample_size: u64 = 1000, - bytes_per_op: usize = 0, -}; - -pub const ReportOptions = struct { - metrics: []const Metrics, - /// The index in 'metrics' to use as the baseline for comparison (e.g 1.00x). - /// If null, no comparison column is shown. - baseline_index: ?usize = null, -}; - -pub fn run(allocator: Allocator, name: []const u8, function: anytype, args: anytype, options: Options) !Metrics { - assertFunctionDef(function, args); - - // ref: https://pyk.sh/blog/2025-12-08-bench-fixing-constant-folding - var runtime_args = createRuntimeArgs(function, args); - std.mem.doNotOptimizeAway(&runtime_args); - - for (0..options.warmup_iters) |_| { - try execute(function, runtime_args); - } - - // We need to determine a batch_size such that the total execution time of the batch - // is large enough to minimize timer resolution noise. - // Target: 1ms (1,000,000 ns) per measurement block. 
- const min_sample_time_ns = 1_000_000; - var batch_size: u64 = 1; - var timer = try Timer.start(); - - while (true) { - timer.reset(); - for (0..batch_size) |_| { - try execute(function, runtime_args); - } - const duration = timer.read(); - - if (duration >= min_sample_time_ns) break; - - // If the duration is 0 (too fast to measure) or small, scale up - if (duration == 0) { - batch_size *= 10; - } else { - const ratio = @as(f64, @floatFromInt(min_sample_time_ns)) / @as(f64, @floatFromInt(duration)); - const multiplier = @as(u64, @intFromFloat(std.math.ceil(ratio))); - if (multiplier <= 1) { - batch_size *= 2; // Fallback growth - } else { - batch_size *= multiplier; - } - } - } - - const samples = try allocator.alloc(f64, options.sample_size); - defer allocator.free(samples); - - for (0..options.sample_size) |i| { - timer.reset(); - for (0..batch_size) |_| { - try execute(function, runtime_args); - } - const total_ns = timer.read(); - // Average time per operation for this batch - samples[i] = @as(f64, @floatFromInt(total_ns)) / @as(f64, @floatFromInt(batch_size)); - } - - // Sort samples to find the median and process min/max - sort.block(f64, samples, {}, sort.asc(f64)); - - var sum: f64 = 0; - for (samples) |s| sum += s; - - const mean = sum / @as(f64, @floatFromInt(options.sample_size)); - - // Calculate Variance for Standard Deviation - var sum_sq_diff: f64 = 0; - for (samples) |s| { - const diff = s - mean; - sum_sq_diff += diff * diff; - } - const variance = sum_sq_diff / @as(f64, @floatFromInt(options.sample_size)); - - // Calculate Operations Per Second - const ops_sec = if (mean > 0) 1_000_000_000.0 / mean else 0; - - // Calculate MB/s (Megabytes per second) - // Formula: (Ops/Sec * Bytes/Op) / 1,000,000 - const mb_sec = if (options.bytes_per_op > 0) - (ops_sec * @as(f64, @floatFromInt(options.bytes_per_op))) / 1_000_000.0 - else - 0; - - var metrics = Metrics{ - .name = name, - .min_ns = samples[0], - .max_ns = samples[samples.len - 1], - .mean_ns = mean, - .median_ns = samples[options.sample_size / 2], - .std_dev_ns = math.sqrt(variance), - .samples = options.sample_size, - .ops_sec = ops_sec, - .mb_sec = mb_sec, - }; - - if (builtin.os.tag == .linux) { - if (Perf.init()) |p| { - var perf = p; - defer perf.deinit(); - - try perf.capture(); - for (0..options.sample_size) |_| { - for (0..batch_size) |_| { - try execute(function, runtime_args); - } - } - try perf.stop(); - - const m = try perf.read(); - const total_ops = @as(f64, @floatFromInt(options.sample_size * batch_size)); - const avg_cycles = @as(f64, @floatFromInt(m.cycles)) / total_ops; - const avg_instr = @as(f64, @floatFromInt(m.instructions)) / total_ops; - const avg_misses = @as(f64, @floatFromInt(m.cache_misses)) / total_ops; - - metrics.cycles = avg_cycles; - metrics.instructions = avg_instr; - metrics.cache_misses = avg_misses; - if (avg_cycles > 0) { - metrics.ipc = avg_instr / avg_cycles; - } - } else |_| {} // skip counter if we can't open it - } - - return metrics; -} - -inline fn execute(function: anytype, args: anytype) !void { - const FnType = unwrapFnType(@TypeOf(function)); - const return_type = @typeInfo(FnType).@"fn".return_type.?; - - // Conditional execution based on whether the function can fail - if (@typeInfo(return_type) == .error_union) { - const result = try @call(.auto, function, args); - std.mem.doNotOptimizeAway(result); - } else { - const result = @call(.auto, function, args); - std.mem.doNotOptimizeAway(result); - } -} - -/// Returns the underlying Function type, unwrapping it if it is a 
pointer. -fn unwrapFnType(comptime T: type) type { - if (@typeInfo(T) == .pointer) return @typeInfo(T).pointer.child; - return T; -} - -//////////////////////////////////////////////////////////////////////////////// -// Function definition checker - -fn assertFunctionDef(function: anytype, args: anytype) void { - const ArgsType = @TypeOf(args); - const args_info = @typeInfo(ArgsType); - if (args_info != .@"struct" or !args_info.@"struct".is_tuple) { - @compileError("Expected 'args' to be a tuple, found '" ++ @typeName(ArgsType) ++ "'"); - } - - const FnType = unwrapFnType(@TypeOf(function)); - if (@typeInfo(FnType) != .@"fn") { - @compileError("Expected 'function' to be a function or function pointer, found '" ++ @typeName(@TypeOf(function)) ++ "'"); - } - - const params_len = @typeInfo(FnType).@"fn".params.len; - const args_len = @typeInfo(ArgsType).@"struct".fields.len; - - if (params_len != args_len) { - @compileError(std.fmt.comptimePrint( - "Function expects {d} arguments, but args tuple has {d}", - .{ params_len, args_len }, - )); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Runtime Arguments Helpers - -/// Constructs the runtime argument tuple based on function parameters and input args. -fn createRuntimeArgs(function: anytype, args: anytype) RuntimeArgsType(@TypeOf(function), @TypeOf(args)) { - const TupleType = RuntimeArgsType(@TypeOf(function), @TypeOf(args)); - var runtime_args: TupleType = undefined; - - // We only need the length here to iterate - const fn_params = getFnParams(@TypeOf(function)); - - inline for (0..fn_params.len) |i| { - runtime_args[i] = args[i]; - } - return runtime_args; -} - -/// Computes the precise Tuple type required to hold the arguments. -fn RuntimeArgsType(comptime FnType: type, comptime ArgsType: type) type { - const fn_params = getFnParams(FnType); - const args_fields = @typeInfo(ArgsType).@"struct".fields; - comptime var types: [fn_params.len]type = undefined; - inline for (fn_params, 0..) |p, i| { - if (p.type) |t| { - types[i] = t; - } else { - types[i] = args_fields[i].type; - } - } - return std.meta.Tuple(&types); -} - -/// Helper to unwrap function pointers and retrieve parameter info -fn getFnParams(comptime FnType: type) []const std.builtin.Type.Fn.Param { - return @typeInfo(unwrapFnType(FnType)).@"fn".params; -} - -//////////////////////////////////////////////////////////////////////////////// -// reporters - -fn writeColor(writer: *Writer, color: tty.Color, text: []const u8) !void { - const config = tty.Config.detect(std.fs.File.stdout()); - if (config != .no_color) { - switch (color) { - .reset => try writer.writeAll("\x1b[0m"), - .red => try writer.writeAll("\x1b[31m"), - .green => try writer.writeAll("\x1b[32m"), - .blue => try writer.writeAll("\x1b[34m"), - .cyan => try writer.writeAll("\x1b[36m"), - .dim => try writer.writeAll("\x1b[2m"), - .black => try writer.writeAll("\x1b[90m"), - else => try writer.writeAll(""), - } - } - try writer.writeAll(text); - if (config != .no_color) try writer.writeAll("\x1b[0m"); -} - -/// Writes the formatted report to a specific writer -pub fn writeReport(writer: *Writer, options: ReportOptions) !void { - if (options.metrics.len == 0) return; - - try writer.print("Benchmark Summary: {d} benchmarks run\n", .{options.metrics.len}); - - var max_name_len: usize = 0; - for (options.metrics) |m| max_name_len = @max(max_name_len, m.name.len); - - for (options.metrics, 0..) 
|m, i| { - const is_last_item = i == options.metrics.len - 1; - - // --- ROW 1: High Level (Name | Time | Speed | Comparison) --- - const tree_char = if (is_last_item) "└─ " else "├─ "; - try writeColor(writer, .bright_black, tree_char); - try writeColor(writer, .cyan, m.name); - // try writer.print("{s}{s}", .{ tree_char, m.name }); - - // Align name - const padding = max_name_len - m.name.len + 2; - _ = try writer.splatByte(' ', padding); - - try fmtTime(writer, m.median_ns); - try writer.writeAll(" "); - - if (m.mb_sec > 0.001) { - try fmtBandwidth(writer, m.mb_sec); - } else { - try fmtOps(writer, m.ops_sec); - } - - // Comparison (On the first line now) - if (options.baseline_index) |base_idx| { - try writer.writeAll(" "); - if (i == base_idx) { - try writeColor(writer, .blue, "[baseline]"); - } else if (base_idx < options.metrics.len) { - const base = options.metrics[base_idx]; - const base_f = base.median_ns; - const curr_f = m.median_ns; - - if (curr_f > 0 and base_f > 0) { - if (curr_f < base_f) { - try writer.writeAll("\x1b[32m"); // Green manually to mix with print - try writer.print("{d:.2}x faster", .{base_f / curr_f}); - try writer.writeAll("\x1b[0m"); - } else { - try writer.writeAll("\x1b[31m"); - try writer.print("{d:.2}x slower", .{curr_f / base_f}); - try writer.writeAll("\x1b[0m"); - } - } else { - try writer.writeAll("-"); - } - } - } - try writer.writeByte('\n'); - - // Only printed if we have hardware stats - if (m.cycles) |cycles| { - const sub_tree_prefix = if (is_last_item) " └─ " else "│ └─ "; - try writer.writeAll(sub_tree_prefix); - try writeColor(writer, .dim, "cycles: "); - try fmtInt(writer, cycles); - } - - if (m.instructions) |instructions| { - try writer.writeAll("\t"); - try writeColor(writer, .dim, "instructions: "); - try fmtInt(writer, instructions); - } - - if (m.ipc) |ipc| { - try writer.writeAll("\t"); - try writeColor(writer, .dim, "ipc: "); - try writer.print("{d:.2}", .{ipc}); - } - - if (m.cache_misses) |cache_missess| { - try writer.writeAll("\t"); - try writeColor(writer, .dim, "miss: "); - try fmtInt(writer, cache_missess); - - try writer.writeByte('\n'); - } - } -} - -/// Prints a formatted summary table to stdout. 
-pub fn report(options: ReportOptions) !void { - var stdout_buffer: [0x2000]u8 = undefined; - var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer); - const stdout = &stdout_writer.interface; - try writeReport(stdout, options); - try stdout.flush(); -} - -//////////////////////////////////////////////////////////////////////////////// -// formatters - -fn fmtInt(writer: *Writer, val: f64) !void { - if (val < 1000) { - try writer.print("{d:.0}", .{val}); - } else if (val < 1_000_000) { - try writer.print("{d:.1}k", .{val / 1000.0}); - } else if (val < 1_000_000_000) { - try writer.print("{d:.1}M", .{val / 1_000_000.0}); - } else { - try writer.print("{d:.1}G", .{val / 1_000_000_000.0}); - } -} - -fn fmtTime(writer: *Writer, ns: f64) !void { - var buf: [64]u8 = undefined; - var slice: []u8 = undefined; - - if (ns < 1000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}ns", .{ns}); - } else if (ns < 1_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}us", .{ns / 1000.0}); - } else if (ns < 1_000_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}ms", .{ns / 1_000_000.0}); - } else { - slice = try std.fmt.bufPrint(&buf, "{d:.2}s", .{ns / 1_000_000_000.0}); - } - try padLeft(writer, slice, 9); -} - -fn fmtOps(writer: *Writer, ops: f64) !void { - var buf: [64]u8 = undefined; - var slice: []u8 = undefined; - - if (ops < 1000) { - slice = try std.fmt.bufPrint(&buf, "{d:.0}/s", .{ops}); - } else if (ops < 1_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}K/s", .{ops / 1000.0}); - } else if (ops < 1_000_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}M/s", .{ops / 1_000_000.0}); - } else { - slice = try std.fmt.bufPrint(&buf, "{d:.2}G/s", .{ops / 1_000_000_000.0}); - } - try padLeft(writer, slice, 11); -} - -fn fmtBandwidth(writer: *Writer, mb: f64) !void { - var buf: [64]u8 = undefined; - var slice: []u8 = undefined; - - if (mb >= 1000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}GB/s", .{mb / 1000.0}); - } else { - slice = try std.fmt.bufPrint(&buf, "{d:.2}MB/s", .{mb}); - } - try padLeft(writer, slice, 11); -} - -// Pads with spaces on the left (for numbers) -fn padLeft(writer: *Writer, text: []const u8, width: usize) !void { - if (text.len < width) { - _ = try writer.splatByte(' ', width - text.len); - } - try writer.writeAll(text); -} - -// Pads with spaces on the right (for text/comparisons) -fn padRight(writer: *Writer, text: []const u8, width: usize) !void { - try writer.writeAll(text); - if (text.len < width) { - _ = try writer.splatByte(' ', width - text.len); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// tests +pub const Options = Runner.Options; +pub const run = Runner.run; +pub const report = Reporter.report; test { - _ = @import("test.zig"); if (builtin.os.tag == .linux) { - _ = @import("Perf.test.zig"); + _ = @import("perf.test.zig"); } + _ = @import("Runner.test.zig"); + _ = @import("Reporter.test.zig"); }