diff --git a/src/Metrics.zig b/src/Metrics.zig new file mode 100644 index 0000000..8463445 --- /dev/null +++ b/src/Metrics.zig @@ -0,0 +1,41 @@ +/////////////////////////////////////////////////////////////////////////////// +// Meta + +/// The identifier string for the benchmark +name: []const u8, +/// Total number of measurement samples collected +samples: usize, + +/////////////////////////////////////////////////////////////////////////////// +// Time + +/// Minimum execution time per operation (nanoseconds) +min_ns: f64, +/// Maximum execution time per operation (nanoseconds) +max_ns: f64, +/// Mean execution time (nanoseconds) +mean_ns: f64, +/// Median execution time (nanoseconds) +median_ns: f64, +/// Standard deviation of the execution time +std_dev_ns: f64, + +/////////////////////////////////////////////////////////////////////////////// +// Throughput + +/// Calculated operations per second +ops_sec: f64, +/// Data throughput in MB/s (populated if `bytes_per_op` > 0) +mb_sec: f64, + +/////////////////////////////////////////////////////////////////////////////// +// Hardware (Linux only, null otherwise) + +/// Average CPU cycles per operation +cycles: ?f64 = null, +/// Average CPU instructions executed per operation +instructions: ?f64 = null, +/// Instructions Per Cycle (efficiency ratio) +ipc: ?f64 = null, +/// Average cache misses per operation +cache_misses: ?f64 = null, diff --git a/src/Perf.test.zig b/src/Perf.test.zig deleted file mode 100644 index fe3b1e4..0000000 --- a/src/Perf.test.zig +++ /dev/null @@ -1,51 +0,0 @@ -const std = @import("std"); -const testing = std.testing; -const builtin = @import("builtin"); -const Perf = @import("Perf.zig"); - -test "Perf: lifecycle" { - var perf = Perf.init() catch return error.SkipZigTest; - defer perf.deinit(); - - try perf.capture(); - - var x: u64 = 0; - for (0..10_000) |i| { - x +%= i; - std.mem.doNotOptimizeAway(x); - } - - try perf.stop(); - const m = try perf.read(); - - // Verify we captured instructions - if (m.instructions == 0) { - std.debug.print("WARN: Captured 0 instructions. Check permissions.\n", .{}); - } else { - try testing.expect(m.instructions > 10_000); - try testing.expect(m.cycles > 0); - } -} - -test "Perf: cache misses" { - var perf = Perf.init() catch return error.SkipZigTest; - defer perf.deinit(); - - try perf.capture(); - - // Thrash L1 cache - var buf = try testing.allocator.alloc(u8, 1024 * 1024); - defer testing.allocator.free(buf); - @memset(buf, 0xAA); - - var sum: u64 = 0; - var i: usize = 0; - while (i < buf.len) : (i += 64) { - sum +%= buf[i]; - } - std.mem.doNotOptimizeAway(sum); - - try perf.stop(); - const m = try perf.read(); - std.debug.print("m = {any}", .{m}); -} diff --git a/src/Perf.zig b/src/Perf.zig deleted file mode 100644 index b761fc2..0000000 --- a/src/Perf.zig +++ /dev/null @@ -1,152 +0,0 @@ -// References: https://man7.org/linux/man-pages/man2/perf_event_open.2.html - -const std = @import("std"); -const builtin = @import("builtin"); -const linux = std.os.linux; -const posix = std.posix; - -const Perf = @This(); -const PERF_EVENT_IOC_ID = linux.IOCTL.IOR('$', 7, u64); - -leader_fd: posix.fd_t = -1, -sibling_fds: [2]posix.fd_t = .{ -1, -1 }, - -/// IDs assigned by the kernel to identify events in the read buffer. 
-/// Indices: 0=Cycles, 1=Instructions, 2=CacheMisses -ids: [3]u64 = .{ 0, 0, 0 }, - -pub const Measurements = struct { - cycles: u64, - instructions: u64, - cache_misses: u64, -}; - -pub fn init() !Perf { - var self = Perf{}; - - // CPU Cycles (Group Leader) - self.leader_fd = try openEvent(.cpu_cycles, -1); - self.ids[0] = try getId(self.leader_fd); - - { - const fd = try openEvent(.instructions, self.leader_fd); - self.ids[1] = try getId(fd); - self.sibling_fds[0] = fd; - } - - { - const fd = try openEvent(.cache_misses, self.leader_fd); - self.ids[2] = try getId(fd); - self.sibling_fds[1] = fd; - } - - return self; -} - -pub fn deinit(self: *Perf) void { - if (self.leader_fd != -1) { - _ = linux.close(self.leader_fd); - self.leader_fd = -1; - } - for (self.sibling_fds, 0..) |fd, i| { - if (fd != -1) _ = linux.close(fd); - self.sibling_fds[i] = -1; - } -} - -pub fn capture(self: *Perf) !void { - if (self.leader_fd == -1) return; - const reset = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.RESET, 0); - if (std.c.errno(reset) != .SUCCESS) @panic("ioctl/reset fails"); - const enable = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.ENABLE, 0); - if (std.c.errno(enable) != .SUCCESS) @panic("ioctl/enable fails"); -} - -pub fn stop(self: *Perf) !void { - if (self.leader_fd == -1) return; - const disable = linux.ioctl(self.leader_fd, linux.PERF.EVENT_IOC.DISABLE, 0); - if (std.c.errno(disable) != .SUCCESS) @panic("ioctl/disable fails"); -} - -/// Reads the counter values. -/// Returns a struct with the collected data. -pub fn read(self: *Perf) !Measurements { - var m = Measurements{ - .cycles = 0, - .instructions = 0, - .cache_misses = 0, - }; - if (self.leader_fd == -1) return m; - - // Format: PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_GROUP - // Layout: nr, time_enabled, time_running, [value, id], [value, id], ... - // Max items = 3. Header = 3 u64. 
Total u64s = 3 + (2 * 3) = 9 - var buf: [16]u64 = undefined; - - _ = try posix.read(self.leader_fd, std.mem.sliceAsBytes(&buf)); - - const nr = buf[0]; - const time_enabled = buf[1]; - const time_running = buf[2]; - - // std.debug.print("nr={d}\n", .{nr}); - // std.debug.print("time_running={d}\n", .{time_running}); - - if (time_running == 0) return m; - - var i: usize = 0; - while (i < nr) : (i += 1) { - const base_idx = 3 + (i * 2); - if (base_idx + 1 >= buf.len) break; - - var val = buf[base_idx]; - const id = buf[base_idx + 1]; - - // std.debug.print("i={d} val={d} (before)\n", .{ i, val }); - if (time_running < time_enabled) { - val = @as(u64, @intFromFloat(@as(f64, @floatFromInt(val)) * (@as(f64, @floatFromInt(time_enabled)) / @as(f64, @floatFromInt(time_running))))); - } - - // std.debug.print("i={d} val={d} (after)\n", .{ i, val }); - // std.debug.print("i={d} id={d}\n", .{ i, id }); - - if (id == self.ids[0]) m.cycles = val; - if (id == self.ids[1]) m.instructions = val; - if (id == self.ids[2]) m.cache_misses = val; - } - - return m; -} - -const Event = enum { cpu_cycles, instructions, cache_misses }; - -fn openEvent(event: Event, group_fd: posix.fd_t) !posix.fd_t { - const config: u64 = switch (event) { - .cpu_cycles => @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES), - .instructions => @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS), - .cache_misses => @intFromEnum(linux.PERF.COUNT.HW.CACHE_MISSES), - }; - - var attr = std.mem.zeroes(linux.perf_event_attr); - attr.type = linux.PERF.TYPE.HARDWARE; - attr.config = config; - - // Enable grouping and ID tracking - attr.read_format = 1 << 0 | 1 << 1 | 1 << 2 | 1 << 3; - - attr.flags.disabled = (group_fd == -1); // Only leader starts disabled - attr.flags.inherit = true; - attr.flags.exclude_kernel = true; - attr.flags.exclude_hv = true; - - const fd = try posix.perf_event_open(&attr, 0, -1, group_fd, 0); - return fd; -} - -fn getId(fd: i32) !u64 { - var id: u64 = 0; - if (linux.ioctl(fd, PERF_EVENT_IOC_ID, @intFromPtr(&id)) != 0) { - return error.IoctlFailed; - } - return id; -} diff --git a/src/Reporter.test.zig b/src/Reporter.test.zig new file mode 100644 index 0000000..d40c49c --- /dev/null +++ b/src/Reporter.test.zig @@ -0,0 +1,34 @@ +const std = @import("std"); +const testing = std.testing; + +const Runner = @import("Runner.zig"); +const Reporter = @import("Reporter.zig"); + +fn fibNaive(n: u64) u64 { + if (n <= 1) return n; + return fibNaive(n - 1) + fibNaive(n - 2); +} + +fn fibIterative(n: u64) u64 { + if (n == 0) return 0; + var a: u64 = 0; + var b: u64 = 1; + for (2..n + 1) |_| { + const c = a + b; + a = b; + b = c; + } + return b; +} + +test "report fib" { + const allocator = testing.allocator; + const opts = Runner.Options{ + .sample_size = 100, + .warmup_iters = 3, + }; + const m_naive = try Runner.run(allocator, "fibNaive", fibNaive, .{@as(u64, 20)}, opts); + const m_iter = try Runner.run(allocator, "fibIterative", fibIterative, .{@as(u64, 20)}, opts); + + try Reporter.report(.{ .metrics = &.{ m_naive, m_iter }, .baseline_index = 0 }); +} diff --git a/src/Reporter.zig b/src/Reporter.zig new file mode 100644 index 0000000..118cdf6 --- /dev/null +++ b/src/Reporter.zig @@ -0,0 +1,201 @@ +const std = @import("std"); +const Writer = std.Io.Writer; +const tty = std.Io.tty; + +const Metrics = @import("Metrics.zig"); + +pub const Options = struct { + metrics: []const Metrics, + /// The index in 'metrics' to use as the baseline for comparison (e.g 1.00x). + /// If null, no comparison column is shown. 
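+    /// An index outside 'metrics' is ignored and no comparison is shown.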
+    baseline_index: ?usize = null,
+};
+
+/// Prints a formatted summary table via std.debug.print (stderr).
+pub fn report(options: Options) !void {
+    var buffer: [0x2000]u8 = undefined;
+    var w: Writer = .fixed(&buffer);
+    try writeReport(&w, options);
+    std.debug.print("{s}", .{w.buffered()});
+}
+
+/// Writes the formatted report to the given writer.
+pub fn writeReport(writer: *Writer, options: Options) !void {
+    if (options.metrics.len == 0) return;
+
+    try writer.print("Benchmark Summary: {d} benchmarks run\n", .{options.metrics.len});
+
+    var max_name_len: usize = 0;
+    for (options.metrics) |m| max_name_len = @max(max_name_len, m.name.len);
+
+    for (options.metrics, 0..) |m, i| {
+        const is_last_item = i == options.metrics.len - 1;
+
+        // --- ROW 1: High Level (Name | Time | Speed | Comparison) ---
+        const tree_char = if (is_last_item) "└─ " else "├─ ";
+        try writeColor(writer, .bright_black, tree_char);
+        try writeColor(writer, .cyan, m.name);
+
+        // Align name
+        const padding = max_name_len - m.name.len + 2;
+        _ = try writer.splatByte(' ', padding);
+
+        try fmtTime(writer, m.median_ns);
+        try writer.writeAll(" ");
+
+        if (m.mb_sec > 0.001) {
+            try fmtBandwidth(writer, m.mb_sec);
+        } else {
+            try fmtOps(writer, m.ops_sec);
+        }
+
+        // Comparison column
+        if (options.baseline_index) |base_idx| {
+            try writer.writeAll(" ");
+            if (i == base_idx) {
+                try writeColor(writer, .blue, "[baseline]");
+            } else if (base_idx < options.metrics.len) {
+                const base = options.metrics[base_idx];
+                const base_f = base.median_ns;
+                const curr_f = m.median_ns;
+
+                if (curr_f > 0 and base_f > 0) {
+                    if (curr_f < base_f) {
+                        try writer.writeAll("\x1b[32m"); // raw green escape to mix with print
+                        try writer.print("{d:.2}x faster", .{base_f / curr_f});
+                        try writer.writeAll("\x1b[0m");
+                    } else {
+                        try writer.writeAll("\x1b[31m");
+                        try writer.print("{d:.2}x slower", .{curr_f / base_f});
+                        try writer.writeAll("\x1b[0m");
+                    }
+                } else {
+                    try writer.writeAll("-");
+                }
+            }
+        }
+        try writer.writeByte('\n');
+
+        // Only printed if we have hardware stats
+        if (m.cycles) |cycles| {
+            const sub_tree_prefix = if (is_last_item) " └─ " else "│ └─ ";
+            try writer.writeAll(sub_tree_prefix);
+            try writeColor(writer, .dim, "cycles: ");
+            try fmtInt(writer, cycles);
+        }
+
+        if (m.instructions) |instructions| {
+            try writer.writeAll("\t");
+            try writeColor(writer, .dim, "instructions: ");
+            try fmtInt(writer, instructions);
+        }
+
+        if (m.ipc) |ipc| {
+            try writer.writeAll("\t");
+            try writeColor(writer, .dim, "ipc: ");
+            try writer.print("{d:.2}", .{ipc});
+        }
+
+        if (m.cache_misses) |cache_misses| {
+            try writer.writeAll("\t");
+            try writeColor(writer, .dim, "miss: ");
+            try fmtInt(writer, cache_misses);
+        }
+
+        // Terminate the hardware row if any hardware stat was printed.
+        if (m.cycles != null or m.instructions != null or m.ipc != null or m.cache_misses != null) {
+            try writer.writeByte('\n');
+        }
+    }
+}
+
+fn writeColor(writer: *Writer, color: tty.Color, text: []const u8) !void {
+    const config = tty.Config.detect(std.fs.File.stdout());
+    if (config != .no_color) {
+        switch (color) {
+            .reset => try writer.writeAll("\x1b[0m"),
+            .red => try writer.writeAll("\x1b[31m"),
+            .green => try writer.writeAll("\x1b[32m"),
+            .blue => try writer.writeAll("\x1b[34m"),
+            .cyan => try writer.writeAll("\x1b[36m"),
+            .dim => try writer.writeAll("\x1b[2m"),
+            .bright_black => try writer.writeAll("\x1b[90m"),
+            else => try writer.writeAll(""),
+        }
+    }
+    try writer.writeAll(text);
+    if (config != .no_color) try writer.writeAll("\x1b[0m");
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// formatters
+
+fn 
fmtInt(writer: *Writer, val: f64) !void { + if (val < 1000) { + try writer.print("{d:.0}", .{val}); + } else if (val < 1_000_000) { + try writer.print("{d:.1}k", .{val / 1000.0}); + } else if (val < 1_000_000_000) { + try writer.print("{d:.1}M", .{val / 1_000_000.0}); + } else { + try writer.print("{d:.1}G", .{val / 1_000_000_000.0}); + } +} + +fn fmtTime(writer: *Writer, ns: f64) !void { + var buf: [64]u8 = undefined; + var slice: []u8 = undefined; + + if (ns < 1000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}ns", .{ns}); + } else if (ns < 1_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}us", .{ns / 1000.0}); + } else if (ns < 1_000_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}ms", .{ns / 1_000_000.0}); + } else { + slice = try std.fmt.bufPrint(&buf, "{d:.2}s", .{ns / 1_000_000_000.0}); + } + try padLeft(writer, slice, 9); +} + +fn fmtOps(writer: *Writer, ops: f64) !void { + var buf: [64]u8 = undefined; + var slice: []u8 = undefined; + + if (ops < 1000) { + slice = try std.fmt.bufPrint(&buf, "{d:.0}/s", .{ops}); + } else if (ops < 1_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}K/s", .{ops / 1000.0}); + } else if (ops < 1_000_000_000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}M/s", .{ops / 1_000_000.0}); + } else { + slice = try std.fmt.bufPrint(&buf, "{d:.2}G/s", .{ops / 1_000_000_000.0}); + } + try padLeft(writer, slice, 11); +} + +fn fmtBandwidth(writer: *Writer, mb: f64) !void { + var buf: [64]u8 = undefined; + var slice: []u8 = undefined; + + if (mb >= 1000) { + slice = try std.fmt.bufPrint(&buf, "{d:.2}GB/s", .{mb / 1000.0}); + } else { + slice = try std.fmt.bufPrint(&buf, "{d:.2}MB/s", .{mb}); + } + try padLeft(writer, slice, 11); +} + +// Pads with spaces on the left (for numbers) +fn padLeft(writer: *Writer, text: []const u8, width: usize) !void { + if (text.len < width) { + _ = try writer.splatByte(' ', width - text.len); + } + try writer.writeAll(text); +} + +// Pads with spaces on the right (for text/comparisons) +fn padRight(writer: *Writer, text: []const u8, width: usize) !void { + try writer.writeAll(text); + if (text.len < width) { + _ = try writer.splatByte(' ', width - text.len); + } +} diff --git a/src/test.zig b/src/Runner.test.zig similarity index 65% rename from src/test.zig rename to src/Runner.test.zig index c9462a4..76f0d51 100644 --- a/src/test.zig +++ b/src/Runner.test.zig @@ -2,7 +2,7 @@ const builtin = @import("builtin"); const std = @import("std"); const testing = std.testing; -const bench = @import("root.zig"); +const Runner = @import("Runner.zig"); fn noOp() !void { // Pure overhead measurement @@ -25,18 +25,9 @@ fn sleepWork() !void { std.mem.doNotOptimizeAway(io); } -// Global buffer for memory test -var src_buf: [16 * 1024]u8 = undefined; -var dst_buf: [16 * 1024]u8 = undefined; - -fn copyWork() !void { - @memcpy(&dst_buf, &src_buf); - std.mem.doNotOptimizeAway(dst_buf); -} - -test "run: basic check" { +test "basic metrics" { const allocator = testing.allocator; - const noop_metrics = try bench.run(allocator, "NoOp", noOp, .{}, .{}); + const noop_metrics = try Runner.run(allocator, "NoOp", noOp, .{}, .{}); // The minimum cannot be larger than the maximum try testing.expect(noop_metrics.min_ns <= noop_metrics.max_ns); @@ -48,7 +39,7 @@ test "run: basic check" { // Execution must take some time (non-zero) try testing.expect(noop_metrics.min_ns > 0); - const busy_metrics = try bench.run(allocator, "Busy", busyWork, .{}, .{}); + const busy_metrics = try Runner.run(allocator, "Busy", busyWork, .{}, .{}); // The busy function 
MUST be slower than the no-op try testing.expect(busy_metrics.median_ns > noop_metrics.median_ns); @@ -58,62 +49,17 @@ test "run: basic check" { // not just the overhead of the tool itself. try testing.expect(busy_metrics.median_ns > (noop_metrics.median_ns * 2)); - const sleep_metrics = try bench.run(allocator, "Sleep", sleepWork, .{}, .{}); + const sleep_metrics = try Runner.run(allocator, "Sleep", sleepWork, .{}, .{}); const target_ns = 1 * std.time.ns_per_ms; // We check if the result is reasonably close to 1ms. // Note: OS Sleep is imprecise. It will always be >= target, never less. - // We allow a "scheduler noise" overhead (e.g., +2ms tolerance for CI environments). try testing.expect(sleep_metrics.median_ns >= target_ns); const tolerance = 2 * std.time.ns_per_ms; try testing.expect(sleep_metrics.median_ns < (target_ns + tolerance)); } -test "run: bandwidth check" { - const allocator = testing.allocator; - @memset(&src_buf, 0xAA); - const metrics = try bench.run(allocator, "Copy", copyWork, .{}, .{ - .sample_size = 1000, - .bytes_per_op = src_buf.len, - }); - - try testing.expect(metrics.mb_sec > 0); - try testing.expect(metrics.mb_sec > 1.0); // Sanity check -} - -test "report: output" { - const allocator = testing.allocator; - @memset(&src_buf, 0xAA); - const copy_metrics = try bench.run(allocator, "Copy", copyWork, .{}, .{ - .sample_size = 1000, - .bytes_per_op = src_buf.len, - }); - - const noop_metrics = try bench.run(allocator, "NoOp", noOp, .{}, .{}); - const sleep_metrics = try bench.run(allocator, "Sleep", sleepWork, .{}, .{}); - const busy_metrics = try bench.run(allocator, "Busy", busyWork, .{}, .{}); - - var single: std.Io.Writer.Allocating = .init(allocator); - defer single.deinit(); - try bench.writeReport(&single.writer, .{ .metrics = &.{copy_metrics} }); - - var double: std.Io.Writer.Allocating = .init(allocator); - defer double.deinit(); - try bench.writeReport(&double.writer, .{ .metrics = &.{ noop_metrics, sleep_metrics } }); - - var baseline: std.Io.Writer.Allocating = .init(allocator); - defer baseline.deinit(); - try bench.writeReport(&baseline.writer, .{ - .metrics = &.{ noop_metrics, sleep_metrics, busy_metrics }, - .baseline_index = 0, - }); - - std.debug.print("\nsingle:\n{s}\n", .{single.written()}); - std.debug.print("\ndouble:\n{s}\n", .{double.written()}); - std.debug.print("\nbaseline:\n{s}\n", .{baseline.written()}); -} - // Simulate a whitespace skipper function fn skipWhitespaceNaive(input: []const u8) !void { var i: usize = 0; @@ -132,7 +78,7 @@ fn skipWhitespaceSIMD(input: []const u8) !void { std.mem.doNotOptimizeAway(i); } -test "run: with args" { +test "run with args" { const allocator = testing.allocator; // Generate test data outside the benchmark @@ -142,8 +88,8 @@ test "run: with args" { @memset(input, ' '); input[len - 1] = 'x'; // Stop at the end - const m_naive = try bench.run(allocator, "Naive", skipWhitespaceNaive, .{input}, .{ .sample_size = 100 }); - const m_simd = try bench.run(allocator, "SIMD", skipWhitespaceSIMD, .{input}, .{ .sample_size = 100 }); + const m_naive = try Runner.run(allocator, "Naive", skipWhitespaceNaive, .{input}, .{ .sample_size = 100 }); + const m_simd = try Runner.run(allocator, "SIMD", skipWhitespaceSIMD, .{input}, .{ .sample_size = 100 }); try testing.expect(m_naive.median_ns > 0); try testing.expect(m_simd.median_ns > 0); @@ -152,20 +98,38 @@ test "run: with args" { try testing.expect(m_simd.median_ns < m_naive.median_ns); } -/////////////////////////////////////////////////////////////////////////////// -// 
accuracy test
+// Global buffer for memory test
+var src_buf: [16 * 1024]u8 = undefined;
+var dst_buf: [16 * 1024]u8 = undefined;
+
+fn copyWork() !void {
+    @memcpy(&dst_buf, &src_buf);
+    std.mem.doNotOptimizeAway(dst_buf);
+}
+
+test "bandwidth check" {
+    const allocator = testing.allocator;
+    @memset(&src_buf, 0xAA);
+    const metrics = try Runner.run(allocator, "Copy", copyWork, .{}, .{
+        .sample_size = 1000,
+        .bytes_per_op = src_buf.len,
+    });
+
+    try testing.expect(metrics.mb_sec > 0);
+    try testing.expect(metrics.mb_sec > 1.0); // Sanity check
+}
 
 fn fastIncrement(val: *u64) !void {
     val.* +%= 1;
     std.mem.doNotOptimizeAway(val.*);
 }
 
-test "accuracy: adaptive batching precision" {
+test "metrics accuracy" {
     const allocator = testing.allocator;
     var x: u64 = 0;
 
     // Run the benchmark on a sub-nanosecond operation
-    const metrics = try bench.run(allocator, "FastInc", fastIncrement, .{&x}, .{
+    const metrics = try Runner.run(allocator, "FastIncrement", fastIncrement, .{&x}, .{
         .warmup_iters = 100,
         .sample_size = 1000,
     });
@@ -210,10 +174,10 @@ fn functionReturnValueError() !u64 {
-test "run: suppported signatures" {
+test "run: supported signatures" {
     const allocator = testing.allocator;
 
-    _ = try bench.run(allocator, "functionReturnVoid", functionReturnVoid, .{}, .{});
-    _ = try bench.run(allocator, "functionReturnVoidError", functionReturnVoidError, .{}, .{});
-    _ = try bench.run(allocator, "functionReturnValue", functionReturnValue, .{}, .{});
-    _ = try bench.run(allocator, "functionReturnValueError", functionReturnValueError, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnVoid", functionReturnVoid, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnVoidError", functionReturnVoidError, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnValue", functionReturnValue, .{}, .{});
+    _ = try Runner.run(allocator, "functionReturnValueError", functionReturnValueError, .{}, .{});
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -239,13 +203,13 @@ fn fibIterative(n: u64) u64 {
 }
 
 test "run: fibonacci" {
-    const allocator = std.heap.smp_allocator;
-    const opts = bench.Options{
+    const allocator = testing.allocator;
+    const opts = Runner.Options{
         .sample_size = 100,
         .warmup_iters = 3,
     };
-    const m_naive = try bench.run(allocator, "fibNaive", fibNaive, .{@as(u64, 30)}, opts);
-    const m_iter = try bench.run(allocator, "fibIterative", fibIterative, .{@as(u64, 30)}, opts);
+    const m_naive = try Runner.run(allocator, "fibNaive", fibNaive, .{@as(u64, 30)}, opts);
+    const m_iter = try Runner.run(allocator, "fibIterative", fibIterative, .{@as(u64, 30)}, opts);
 
     try testing.expect(m_naive.mean_ns > m_iter.mean_ns * 100);
 }
diff --git a/src/Runner.zig b/src/Runner.zig
new file mode 100644
index 0000000..f770756
--- /dev/null
+++ b/src/Runner.zig
@@ -0,0 +1,223 @@
+const builtin = @import("builtin");
+const std = @import("std");
+const math = std.math;
+const sort = std.sort;
+const Timer = std.time.Timer;
+const Allocator = std.mem.Allocator;
+
+const Metrics = @import("Metrics.zig");
+const perf = @import("perf.zig");
+
+pub const Options = struct {
+    warmup_iters: u64 = 100,
+    sample_size: u64 = 1000,
+    bytes_per_op: usize = 0,
+};
+
+pub fn run(allocator: Allocator, name: []const u8, function: anytype, args: anytype, options: Options) !Metrics {
+    assertFunctionDef(function, args);
+
+    // ref: https://pyk.sh/blog/2025-12-08-bench-fixing-constant-folding
+    var runtime_args = createRuntimeArgs(function, args);
+    std.mem.doNotOptimizeAway(&runtime_args);
+
+    for (0..options.warmup_iters) |_| 
{ + try execute(function, runtime_args); + } + + // We need to determine a batch_size such that the total execution time of the batch + // is large enough to minimize timer resolution noise. + // Target: 1ms (1,000,000 ns) per measurement block. + const min_sample_time_ns = 1_000_000; + var batch_size: u64 = 1; + var timer = try Timer.start(); + + while (true) { + timer.reset(); + for (0..batch_size) |_| { + try execute(function, runtime_args); + } + const duration = timer.read(); + + if (duration >= min_sample_time_ns) break; + + // If the duration is 0 (too fast to measure) or small, scale up + if (duration == 0) { + batch_size *= 10; + } else { + const ratio = @as(f64, @floatFromInt(min_sample_time_ns)) / @as(f64, @floatFromInt(duration)); + const multiplier = @as(u64, @intFromFloat(std.math.ceil(ratio))); + if (multiplier <= 1) { + batch_size *= 2; // Fallback growth + } else { + batch_size *= multiplier; + } + } + } + + const samples = try allocator.alloc(f64, options.sample_size); + defer allocator.free(samples); + + for (0..options.sample_size) |i| { + timer.reset(); + for (0..batch_size) |_| { + try execute(function, runtime_args); + } + const total_ns = timer.read(); + // Average time per operation for this batch + samples[i] = @as(f64, @floatFromInt(total_ns)) / @as(f64, @floatFromInt(batch_size)); + } + + // Sort samples to find the median and process min/max + sort.block(f64, samples, {}, sort.asc(f64)); + + var sum: f64 = 0; + for (samples) |s| sum += s; + + const mean = sum / @as(f64, @floatFromInt(options.sample_size)); + + // Calculate Variance for Standard Deviation + var sum_sq_diff: f64 = 0; + for (samples) |s| { + const diff = s - mean; + sum_sq_diff += diff * diff; + } + const variance = sum_sq_diff / @as(f64, @floatFromInt(options.sample_size)); + + // Calculate Operations Per Second + const ops_sec = if (mean > 0) 1_000_000_000.0 / mean else 0; + + // Calculate MB/s (Megabytes per second) + // Formula: (Ops/Sec * Bytes/Op) / 1,000,000 + const mb_sec = if (options.bytes_per_op > 0) + (ops_sec * @as(f64, @floatFromInt(options.bytes_per_op))) / 1_000_000.0 + else + 0; + + var metrics = Metrics{ + .name = name, + .min_ns = samples[0], + .max_ns = samples[samples.len - 1], + .mean_ns = mean, + .median_ns = samples[options.sample_size / 2], + .std_dev_ns = math.sqrt(variance), + .samples = options.sample_size, + .ops_sec = ops_sec, + .mb_sec = mb_sec, + }; + + if (builtin.os.tag == .linux) { + const events = [_]perf.Event{ .cpu_cycles, .instructions, .cache_misses }; + const perf_group = perf.Group(&events); + if (perf_group.init()) |pg| { + var group = pg; + defer group.deinit(); + + try group.enable(); + for (0..options.sample_size) |_| { + for (0..batch_size) |_| { + try execute(function, runtime_args); + } + } + try group.disable(); + + const m = try group.read(); + const total_ops = @as(f64, @floatFromInt(options.sample_size * batch_size)); + const avg_cycles = @as(f64, @floatFromInt(m.cpu_cycles)) / total_ops; + const avg_instr = @as(f64, @floatFromInt(m.instructions)) / total_ops; + const avg_misses = @as(f64, @floatFromInt(m.cache_misses)) / total_ops; + + metrics.cycles = avg_cycles; + metrics.instructions = avg_instr; + metrics.cache_misses = avg_misses; + if (avg_cycles > 0) { + metrics.ipc = avg_instr / avg_cycles; + } + } else |_| {} // skip counter if we can't open it + } + + return metrics; +} + +inline fn execute(function: anytype, args: anytype) !void { + const FnType = unwrapFnType(@TypeOf(function)); + const return_type = 
@typeInfo(FnType).@"fn".return_type.?; + + // Conditional execution based on whether the function can fail + if (@typeInfo(return_type) == .error_union) { + const result = try @call(.auto, function, args); + std.mem.doNotOptimizeAway(result); + } else { + const result = @call(.auto, function, args); + std.mem.doNotOptimizeAway(result); + } +} + +/// Returns the underlying Function type, unwrapping it if it is a pointer. +fn unwrapFnType(comptime T: type) type { + if (@typeInfo(T) == .pointer) return @typeInfo(T).pointer.child; + return T; +} + +//////////////////////////////////////////////////////////////////////////////// +// Function definition checker + +fn assertFunctionDef(function: anytype, args: anytype) void { + const ArgsType = @TypeOf(args); + const args_info = @typeInfo(ArgsType); + if (args_info != .@"struct" or !args_info.@"struct".is_tuple) { + @compileError("Expected 'args' to be a tuple, found '" ++ @typeName(ArgsType) ++ "'"); + } + + const FnType = unwrapFnType(@TypeOf(function)); + if (@typeInfo(FnType) != .@"fn") { + @compileError("Expected 'function' to be a function or function pointer, found '" ++ @typeName(@TypeOf(function)) ++ "'"); + } + + const params_len = @typeInfo(FnType).@"fn".params.len; + const args_len = @typeInfo(ArgsType).@"struct".fields.len; + + if (params_len != args_len) { + @compileError(std.fmt.comptimePrint( + "Function expects {d} arguments, but args tuple has {d}", + .{ params_len, args_len }, + )); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Runtime Arguments Helpers + +/// Constructs the runtime argument tuple based on function parameters and input args. +fn createRuntimeArgs(function: anytype, args: anytype) RuntimeArgsType(@TypeOf(function), @TypeOf(args)) { + const TupleType = RuntimeArgsType(@TypeOf(function), @TypeOf(args)); + var runtime_args: TupleType = undefined; + + // We only need the length here to iterate + const fn_params = getFnParams(@TypeOf(function)); + + inline for (0..fn_params.len) |i| { + runtime_args[i] = args[i]; + } + return runtime_args; +} + +/// Computes the precise Tuple type required to hold the arguments. +fn RuntimeArgsType(comptime FnType: type, comptime ArgsType: type) type { + const fn_params = getFnParams(FnType); + const args_fields = @typeInfo(ArgsType).@"struct".fields; + comptime var types: [fn_params.len]type = undefined; + inline for (fn_params, 0..) 
|p, i| {
+        if (p.type) |t| {
+            types[i] = t;
+        } else {
+            types[i] = args_fields[i].type;
+        }
+    }
+    return std.meta.Tuple(&types);
+}
+
+/// Helper to unwrap function pointers and retrieve parameter info
+fn getFnParams(comptime FnType: type) []const std.builtin.Type.Fn.Param {
+    return @typeInfo(unwrapFnType(FnType)).@"fn".params;
+}
diff --git a/src/perf.test.zig b/src/perf.test.zig
new file mode 100644
index 0000000..9df8232
--- /dev/null
+++ b/src/perf.test.zig
@@ -0,0 +1,98 @@
+const std = @import("std");
+const testing = std.testing;
+const linux = std.os.linux;
+
+const perf = @import("perf.zig");
+
+test "Event toConfig mapping" {
+    try testing.expectEqual(
+        perf.Event.cpu_cycles.toConfig(),
+        @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES),
+    );
+    try testing.expectEqual(
+        perf.Event.instructions.toConfig(),
+        @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS),
+    );
+    try testing.expectEqual(
+        perf.Event.branch_misses.toConfig(),
+        @intFromEnum(linux.PERF.COUNT.HW.BRANCH_MISSES),
+    );
+}
+
+test "GroupReadOutputType generates correct struct fields" {
+    const events = [_]perf.Event{ .cpu_cycles, .branch_misses };
+    const MyCounters = perf.GroupReadOutputType(&events);
+
+    // We expect the struct to have fields named after the events
+    try testing.expect(@hasField(MyCounters, "cpu_cycles"));
+    try testing.expect(@hasField(MyCounters, "branch_misses"));
+
+    // We expect the struct NOT to have fields we didn't include
+    try testing.expect(!@hasField(MyCounters, "instructions"));
+
+    const info = @typeInfo(MyCounters);
+    inline for (info.@"struct".fields) |field| {
+        try testing.expect(field.type == u64);
+    }
+}
+
+test "GroupReadOutputType instantiation and usage" {
+    const events = [_]perf.Event{ .instructions, .cache_misses };
+    const MyCounters = perf.GroupReadOutputType(&events);
+
+    var counters = MyCounters{
+        .instructions = 100,
+        .cache_misses = 5,
+    };
+
+    counters.instructions += 50;
+    try testing.expectEqual(150, counters.instructions);
+    try testing.expectEqual(5, counters.cache_misses);
+}
+
+test "Sanity check" {
+    const ValidGroup = perf.Group(&.{.cpu_cycles});
+    try testing.expect(@sizeOf(ValidGroup) > 0);
+}
+
+test "Group init/deinit lifecycle" {
+    const MyGroup = perf.Group(&.{ .cpu_cycles, .instructions });
+
+    // init() may fail with OpenGroupFailed (EACCES/ENOENT) on many CI
+    // systems, so when the counters cannot be opened we skip the test
+    // instead of failing it.
+ var group = MyGroup.init() catch return error.SkipZigTest; + try testing.expect(group.event_fds[0] != -1); + try testing.expect(group.event_ids[0] != 0); + group.deinit(); + try testing.expect(group.event_fds[0] == -1); + try testing.expect(group.event_ids[0] == 0); +} + +test "Group handles BadGroup error" { + const MyGroup = perf.Group(&.{.cpu_cycles}); + var group = MyGroup.init() catch return error.SkipZigTest; + group.deinit(); + try testing.expectError(error.BadGroup, group.enable()); + try testing.expectError(error.BadGroup, group.disable()); +} + +test "Group lifecycle" { + const MyGroup = perf.Group(&.{ .instructions, .cpu_cycles }); + var group = MyGroup.init() catch return error.SkipZigTest; + defer group.deinit(); + + try group.enable(); + + var x: u64 = 0; + for (0..10_000) |i| { + x +%= i; + std.mem.doNotOptimizeAway(x); + } + + try group.disable(); + const m = try group.read(); + + try testing.expect(m.instructions > 10_000); + try testing.expect(m.cpu_cycles > 0); +} diff --git a/src/perf.zig b/src/perf.zig new file mode 100644 index 0000000..d032388 --- /dev/null +++ b/src/perf.zig @@ -0,0 +1,288 @@ +const std = @import("std"); +const linux = std.os.linux; +const Type = std.builtin.Type; + +// Bits for perf_event_attr.read_format +const PERF_FORMAT_TOTAL_TIME_ENABLED = 1 << 0; +const PERF_FORMAT_TOTAL_TIME_RUNNING = 1 << 1; +const PERF_FORMAT_ID = 1 << 2; +const PERF_FORMAT_GROUP = 1 << 3; + +// Various ioctls act on perf_event_open() file descriptors: +const PERF_EVENT_IOC_ID = linux.IOCTL.IOR('$', 7, u64); +const PERF_EVENT_IOC_RESET = linux.PERF.EVENT_IOC.RESET; +const PERF_EVENT_IOC_ENABLE = linux.PERF.EVENT_IOC.ENABLE; +const PERF_EVENT_IOC_DISABLE = linux.PERF.EVENT_IOC.DISABLE; + +/// The hardware events supported by the kernel for performance monitoring. +/// These map directly to `perf_event_attr.config` values. +pub const Event = enum { + cpu_cycles, + instructions, + cache_misses, + branch_misses, + bus_cycles, + + /// Converts the enum into the specific kernel configuration integer + /// required by the `perf_event_open` syscall. + pub fn toConfig(self: Event) u64 { + return switch (self) { + .cpu_cycles => @intFromEnum(linux.PERF.COUNT.HW.CPU_CYCLES), + .instructions => @intFromEnum(linux.PERF.COUNT.HW.INSTRUCTIONS), + .cache_misses => @intFromEnum(linux.PERF.COUNT.HW.CACHE_MISSES), + .branch_misses => @intFromEnum(linux.PERF.COUNT.HW.BRANCH_MISSES), + .bus_cycles => @intFromEnum(linux.PERF.COUNT.HW.BUS_CYCLES), + }; + } +}; + +pub fn GroupReadOutputType(comptime events: []const Event) type { + var field_names: [events.len][]const u8 = undefined; + var field_types: [events.len]type = undefined; + var field_attrs: [events.len]Type.StructField.Attributes = undefined; + for (events, 0..) |event, index| { + field_names[index] = @tagName(event); + field_types[index] = u64; + field_attrs[index] = .{ + .@"comptime" = false, + .@"align" = @alignOf(u64), + .default_value_ptr = null, + }; + } + return @Struct( + .auto, + null, + &field_names, + &field_types, + &field_attrs, + ); +} + +/// A type-safe wrapper for the Linux `perf_event_open` system call, +/// specifically configured for event grouping (`PERF_FORMAT_GROUP`). +/// +/// `Group` leverages Zig's `comptime` features to generate a custom +/// `ReadOutputType` result type that strictly matches the requested `events`. +/// It manages the complexity of creating a group leader, attaching sibling +/// events, and handling the binary layout of the kernel's read buffer. 
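+///
+/// A minimal usage sketch (this mirrors the lifecycle test in
+/// perf.test.zig; `init`, `enable`, `disable`, and `read` can all fail,
+/// so real callers handle the errors):
+///
+///     const G = Group(&.{ .cpu_cycles, .instructions });
+///     var group = try G.init();
+///     defer group.deinit();
+///
+///     try group.enable();
+///     // ... code under measurement ...
+///     try group.disable();
+///
+///     const counts = try group.read();
+///     // counts.cpu_cycles and counts.instructions hold the (scaled) u64 totals.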
+/// +/// Notes: +/// * The `read()` method returns a struct with named fields corresponding +/// exactly to the input events (e.g. `.cpu_cycles`). +/// * The `read()` method automatically detects if the CPU was oversubscribed +/// and scales the counter values based on `time_enabled` and `time_running`. +/// +/// References: +/// * man 2 perf_event_open +/// * man 1 perf-list +pub fn Group(comptime events: []const Event) type { + if (events.len == 0) @compileError("perf.Group requires at least 1 event"); + + const Error = error{ + /// Failed to open group via perf_event_open + OpenGroupFailed, + /// Failed to retrieve the ID of the event via IOCTL + GetIdFailed, + /// Failed to reset counters via IOCTL + ResetGroupFailed, + /// Failed to enable counters via IOCTL + EnableGroupFailed, + /// Failed to disable counters via IOCTL + DisableGroupFailed, + /// Failed to read data from the file descriptor + ReadGroupFailed, + /// Group already deinitialized + BadGroup, + }; + + const Output = GroupReadOutputType(events); + + // Matches the binary layout of the buffer read from the group leader fd. + // See `man perf_event_open` section "Reading results". + // Corresponds to `struct read_format` when using: + // PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | + // PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID + const ReadFormatGroup = extern struct { + /// The number of events in this group. + nr: u64, + /// Total time the event group was enabled. + time_enabled: u64, + /// Total time the event group was actually running. + time_running: u64, + /// Array of values matching the `nr` of events. + values: [events.len]extern struct { + value: u64, + id: u64, + }, + }; + + return struct { + const Self = @This(); + + event_fds: [events.len]linux.fd_t = undefined, + event_ids: [events.len]u64 = undefined, + + /// Initializes the performance monitoring group. + /// + /// This opens a file descriptor for every event in the `events` list. + /// The first event becomes the group leader. All subsequent events + /// are created as siblings pinned to the leader. + /// + /// The counters start in a disabled state. You must call `enable()` + /// to begin counting. + /// + /// **Note:** The caller owns the returned group and must call `deinit` + /// to close the file descriptors. + pub fn init() Error!Self { + var self = Self{}; + @memset(&self.event_fds, -1); + + // Leader + var group_fd = @as(i32, -1); + const event_config = events[0].toConfig(); + self.event_fds[0] = try perf_open_group(group_fd, event_config); + self.event_ids[0] = try ioctl_get_id(self.event_fds[0]); + group_fd = self.event_fds[0]; + + // Siblings + if (events.len > 1) { + for (events[1..], 1..) |event, i| { + const config = event.toConfig(); + self.event_fds[i] = try perf_open_group(group_fd, config); + self.event_ids[i] = try ioctl_get_id(self.event_fds[i]); + } + } + return self; + } + + /// Closes all file descriptors associated with this event group. + /// This invalidates the group object. + pub fn deinit(self: *Self) void { + for (self.event_fds, 0..) |event_fd, index| { + if (event_fd != -1) { + _ = linux.close(event_fd); + } + self.event_fds[index] = -1; + self.event_ids[index] = 0; + } + } + + /// Resets and enables the event group. Counting begins immediately. + pub fn enable(self: *Self) Error!void { + const group_fd = self.event_fds[0]; + if (group_fd == -1) return error.BadGroup; + try ioctl_reset_group(group_fd); + try ioctl_enable_group(group_fd); + } + + /// Disables the event group. Counting stops immediately. 
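+        /// Returns `error.BadGroup` if the group was already deinitialized.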
+ pub fn disable(self: *Self) Error!void { + const group_fd = self.event_fds[0]; + if (group_fd == -1) return error.BadGroup; + try ioctl_disable_group(group_fd); + } + + /// Reads the current values from the kernel and maps them to the + /// type-safe output struct. + /// + /// This performs the following operations: + /// 1. Reads the `read_format` binary struct from the leader FD. + /// 2. Checks `time_enabled` and `time_running` to detect if the CPU + /// was oversubscribed. + /// 3. If multiplexing occurred (time_running < time_enabled), scales + /// the raw values: `val = raw_val * (time_enabled / time_running)` + /// 4. Maps the kernel's event IDs back to the field names of the output + /// struct. + pub fn read(self: *Self) Error!Output { + var output: Output = std.mem.zeroes(Output); + var data: ReadFormatGroup = undefined; + + const rc = linux.read(self.event_fds[0], @ptrCast(&data), @sizeOf(ReadFormatGroup)); + if (linux.errno(rc) != .SUCCESS) return error.ReadGroupFailed; + + // If time_running is 0, we can't scale, so return zeros. + if (data.time_running == 0) return output; + + // Multiplexing scaling: scaled_value = value * (time_enabled / time_running) + const scale_needed = data.time_running < data.time_enabled; + const scale_factor = if (scale_needed) + @as(f64, @floatFromInt(data.time_enabled)) / @as(f64, @floatFromInt(data.time_running)) + else + 1.0; + + for (data.values) |item| { + var val = item.value; + + if (scale_needed) { + val = @as(u64, @intFromFloat(@as(f64, @floatFromInt(val)) * scale_factor)); + } + + // Map the kernel ID back to our event tags + inline for (events, 0..) |tag, i| { + if (item.id == self.event_ids[i]) { + @field(output, @tagName(tag)) = val; + } + } + } + + return output; + } + + /////////////////////////////////////////////////////////////////////////////// + // perf & ioctl calls + + // Open new file descriptor for the specific event + fn perf_open_group(group_fd: linux.fd_t, config: u64) Error!linux.fd_t { + var attr = std.mem.zeroes(linux.perf_event_attr); + attr.type = linux.PERF.TYPE.HARDWARE; + attr.config = config; + + // Enable grouping and ID tracking + attr.read_format = PERF_FORMAT_GROUP | + PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING | + PERF_FORMAT_ID; + + attr.flags.disabled = (group_fd == -1); // Only leader starts disabled + attr.flags.inherit = true; + attr.flags.exclude_kernel = true; + attr.flags.exclude_hv = true; + + // ref: `man 2 perf_event_open` + // pid=0 (current process), cpu=-1 (any cpu), flags=0 + const pid = 0; + const cpu = -1; + const flags = 0; + + const rc = linux.perf_event_open(&attr, pid, cpu, group_fd, flags); + if (linux.errno(rc) != .SUCCESS) return error.OpenGroupFailed; + return @intCast(rc); + } + + // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_ID` + fn ioctl_get_id(fd: linux.fd_t) Error!u64 { + var id: u64 = 0; + const rc = linux.ioctl(fd, PERF_EVENT_IOC_ID, @intFromPtr(&id)); + if (linux.errno(rc) != .SUCCESS) return error.GetIdFailed; + return id; + } + + // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_RESET` + fn ioctl_reset_group(fd: linux.fd_t) Error!void { + const rc = linux.ioctl(fd, PERF_EVENT_IOC_RESET, 0); + if (linux.errno(rc) != .SUCCESS) return error.ResetGroupFailed; + } + + // ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_ENABLE` + fn ioctl_enable_group(fd: linux.fd_t) Error!void { + const rc = linux.ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); + if (linux.errno(rc) != .SUCCESS) return error.EnableGroupFailed; + } + + // 
ref: `man 2 perf_event_open` then search for `PERF_EVENT_IOC_DISABLE` + fn ioctl_disable_group(fd: linux.fd_t) Error!void { + const rc = linux.ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + if (linux.errno(rc) != .SUCCESS) return error.DisableGroupFailed; + } + }; +} diff --git a/src/root.zig b/src/root.zig index 72ae0b8..04e530e 100644 --- a/src/root.zig +++ b/src/root.zig @@ -1,452 +1,18 @@ const builtin = @import("builtin"); -const std = @import("std"); -const math = std.math; -const sort = std.sort; -const Timer = std.time.Timer; -const Allocator = std.mem.Allocator; -const Writer = std.Io.Writer; -const tty = std.Io.tty; -const Perf = @import("Perf.zig"); +pub const Metrics = @import("Metrics.zig"); +pub const perf = @import("perf.zig"); +pub const Runner = @import("Runner.zig"); +pub const Reporter = @import("Reporter.zig"); -/// Metrics of the execution -pub const Metrics = struct { - name: []const u8, - // Time - min_ns: f64, - max_ns: f64, - mean_ns: f64, - median_ns: f64, - std_dev_ns: f64, - // Throughput - samples: usize, - ops_sec: f64, - mb_sec: f64, - // Hardware (Linux only, null otherwise) - cycles: ?f64 = null, - instructions: ?f64 = null, - ipc: ?f64 = null, - cache_misses: ?f64 = null, -}; - -pub const Options = struct { - warmup_iters: u64 = 100, - sample_size: u64 = 1000, - bytes_per_op: usize = 0, -}; - -pub const ReportOptions = struct { - metrics: []const Metrics, - /// The index in 'metrics' to use as the baseline for comparison (e.g 1.00x). - /// If null, no comparison column is shown. - baseline_index: ?usize = null, -}; - -pub fn run(allocator: Allocator, name: []const u8, function: anytype, args: anytype, options: Options) !Metrics { - assertFunctionDef(function, args); - - // ref: https://pyk.sh/blog/2025-12-08-bench-fixing-constant-folding - var runtime_args = createRuntimeArgs(function, args); - std.mem.doNotOptimizeAway(&runtime_args); - - for (0..options.warmup_iters) |_| { - try execute(function, runtime_args); - } - - // We need to determine a batch_size such that the total execution time of the batch - // is large enough to minimize timer resolution noise. - // Target: 1ms (1,000,000 ns) per measurement block. 
- const min_sample_time_ns = 1_000_000; - var batch_size: u64 = 1; - var timer = try Timer.start(); - - while (true) { - timer.reset(); - for (0..batch_size) |_| { - try execute(function, runtime_args); - } - const duration = timer.read(); - - if (duration >= min_sample_time_ns) break; - - // If the duration is 0 (too fast to measure) or small, scale up - if (duration == 0) { - batch_size *= 10; - } else { - const ratio = @as(f64, @floatFromInt(min_sample_time_ns)) / @as(f64, @floatFromInt(duration)); - const multiplier = @as(u64, @intFromFloat(std.math.ceil(ratio))); - if (multiplier <= 1) { - batch_size *= 2; // Fallback growth - } else { - batch_size *= multiplier; - } - } - } - - const samples = try allocator.alloc(f64, options.sample_size); - defer allocator.free(samples); - - for (0..options.sample_size) |i| { - timer.reset(); - for (0..batch_size) |_| { - try execute(function, runtime_args); - } - const total_ns = timer.read(); - // Average time per operation for this batch - samples[i] = @as(f64, @floatFromInt(total_ns)) / @as(f64, @floatFromInt(batch_size)); - } - - // Sort samples to find the median and process min/max - sort.block(f64, samples, {}, sort.asc(f64)); - - var sum: f64 = 0; - for (samples) |s| sum += s; - - const mean = sum / @as(f64, @floatFromInt(options.sample_size)); - - // Calculate Variance for Standard Deviation - var sum_sq_diff: f64 = 0; - for (samples) |s| { - const diff = s - mean; - sum_sq_diff += diff * diff; - } - const variance = sum_sq_diff / @as(f64, @floatFromInt(options.sample_size)); - - // Calculate Operations Per Second - const ops_sec = if (mean > 0) 1_000_000_000.0 / mean else 0; - - // Calculate MB/s (Megabytes per second) - // Formula: (Ops/Sec * Bytes/Op) / 1,000,000 - const mb_sec = if (options.bytes_per_op > 0) - (ops_sec * @as(f64, @floatFromInt(options.bytes_per_op))) / 1_000_000.0 - else - 0; - - var metrics = Metrics{ - .name = name, - .min_ns = samples[0], - .max_ns = samples[samples.len - 1], - .mean_ns = mean, - .median_ns = samples[options.sample_size / 2], - .std_dev_ns = math.sqrt(variance), - .samples = options.sample_size, - .ops_sec = ops_sec, - .mb_sec = mb_sec, - }; - - if (builtin.os.tag == .linux) { - if (Perf.init()) |p| { - var perf = p; - defer perf.deinit(); - - try perf.capture(); - for (0..options.sample_size) |_| { - for (0..batch_size) |_| { - try execute(function, runtime_args); - } - } - try perf.stop(); - - const m = try perf.read(); - const total_ops = @as(f64, @floatFromInt(options.sample_size * batch_size)); - const avg_cycles = @as(f64, @floatFromInt(m.cycles)) / total_ops; - const avg_instr = @as(f64, @floatFromInt(m.instructions)) / total_ops; - const avg_misses = @as(f64, @floatFromInt(m.cache_misses)) / total_ops; - - metrics.cycles = avg_cycles; - metrics.instructions = avg_instr; - metrics.cache_misses = avg_misses; - if (avg_cycles > 0) { - metrics.ipc = avg_instr / avg_cycles; - } - } else |_| {} // skip counter if we can't open it - } - - return metrics; -} - -inline fn execute(function: anytype, args: anytype) !void { - const FnType = unwrapFnType(@TypeOf(function)); - const return_type = @typeInfo(FnType).@"fn".return_type.?; - - // Conditional execution based on whether the function can fail - if (@typeInfo(return_type) == .error_union) { - const result = try @call(.auto, function, args); - std.mem.doNotOptimizeAway(result); - } else { - const result = @call(.auto, function, args); - std.mem.doNotOptimizeAway(result); - } -} - -/// Returns the underlying Function type, unwrapping it if it is a 
pointer. -fn unwrapFnType(comptime T: type) type { - if (@typeInfo(T) == .pointer) return @typeInfo(T).pointer.child; - return T; -} - -//////////////////////////////////////////////////////////////////////////////// -// Function definition checker - -fn assertFunctionDef(function: anytype, args: anytype) void { - const ArgsType = @TypeOf(args); - const args_info = @typeInfo(ArgsType); - if (args_info != .@"struct" or !args_info.@"struct".is_tuple) { - @compileError("Expected 'args' to be a tuple, found '" ++ @typeName(ArgsType) ++ "'"); - } - - const FnType = unwrapFnType(@TypeOf(function)); - if (@typeInfo(FnType) != .@"fn") { - @compileError("Expected 'function' to be a function or function pointer, found '" ++ @typeName(@TypeOf(function)) ++ "'"); - } - - const params_len = @typeInfo(FnType).@"fn".params.len; - const args_len = @typeInfo(ArgsType).@"struct".fields.len; - - if (params_len != args_len) { - @compileError(std.fmt.comptimePrint( - "Function expects {d} arguments, but args tuple has {d}", - .{ params_len, args_len }, - )); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Runtime Arguments Helpers - -/// Constructs the runtime argument tuple based on function parameters and input args. -fn createRuntimeArgs(function: anytype, args: anytype) RuntimeArgsType(@TypeOf(function), @TypeOf(args)) { - const TupleType = RuntimeArgsType(@TypeOf(function), @TypeOf(args)); - var runtime_args: TupleType = undefined; - - // We only need the length here to iterate - const fn_params = getFnParams(@TypeOf(function)); - - inline for (0..fn_params.len) |i| { - runtime_args[i] = args[i]; - } - return runtime_args; -} - -/// Computes the precise Tuple type required to hold the arguments. -fn RuntimeArgsType(comptime FnType: type, comptime ArgsType: type) type { - const fn_params = getFnParams(FnType); - const args_fields = @typeInfo(ArgsType).@"struct".fields; - comptime var types: [fn_params.len]type = undefined; - inline for (fn_params, 0..) |p, i| { - if (p.type) |t| { - types[i] = t; - } else { - types[i] = args_fields[i].type; - } - } - return std.meta.Tuple(&types); -} - -/// Helper to unwrap function pointers and retrieve parameter info -fn getFnParams(comptime FnType: type) []const std.builtin.Type.Fn.Param { - return @typeInfo(unwrapFnType(FnType)).@"fn".params; -} - -//////////////////////////////////////////////////////////////////////////////// -// reporters - -fn writeColor(writer: *Writer, color: tty.Color, text: []const u8) !void { - const config = tty.Config.detect(std.fs.File.stdout()); - if (config != .no_color) { - switch (color) { - .reset => try writer.writeAll("\x1b[0m"), - .red => try writer.writeAll("\x1b[31m"), - .green => try writer.writeAll("\x1b[32m"), - .blue => try writer.writeAll("\x1b[34m"), - .cyan => try writer.writeAll("\x1b[36m"), - .dim => try writer.writeAll("\x1b[2m"), - .black => try writer.writeAll("\x1b[90m"), - else => try writer.writeAll(""), - } - } - try writer.writeAll(text); - if (config != .no_color) try writer.writeAll("\x1b[0m"); -} - -/// Writes the formatted report to a specific writer -pub fn writeReport(writer: *Writer, options: ReportOptions) !void { - if (options.metrics.len == 0) return; - - try writer.print("Benchmark Summary: {d} benchmarks run\n", .{options.metrics.len}); - - var max_name_len: usize = 0; - for (options.metrics) |m| max_name_len = @max(max_name_len, m.name.len); - - for (options.metrics, 0..) 
|m, i| { - const is_last_item = i == options.metrics.len - 1; - - // --- ROW 1: High Level (Name | Time | Speed | Comparison) --- - const tree_char = if (is_last_item) "└─ " else "├─ "; - try writeColor(writer, .bright_black, tree_char); - try writeColor(writer, .cyan, m.name); - // try writer.print("{s}{s}", .{ tree_char, m.name }); - - // Align name - const padding = max_name_len - m.name.len + 2; - _ = try writer.splatByte(' ', padding); - - try fmtTime(writer, m.median_ns); - try writer.writeAll(" "); - - if (m.mb_sec > 0.001) { - try fmtBandwidth(writer, m.mb_sec); - } else { - try fmtOps(writer, m.ops_sec); - } - - // Comparison (On the first line now) - if (options.baseline_index) |base_idx| { - try writer.writeAll(" "); - if (i == base_idx) { - try writeColor(writer, .blue, "[baseline]"); - } else if (base_idx < options.metrics.len) { - const base = options.metrics[base_idx]; - const base_f = base.median_ns; - const curr_f = m.median_ns; - - if (curr_f > 0 and base_f > 0) { - if (curr_f < base_f) { - try writer.writeAll("\x1b[32m"); // Green manually to mix with print - try writer.print("{d:.2}x faster", .{base_f / curr_f}); - try writer.writeAll("\x1b[0m"); - } else { - try writer.writeAll("\x1b[31m"); - try writer.print("{d:.2}x slower", .{curr_f / base_f}); - try writer.writeAll("\x1b[0m"); - } - } else { - try writer.writeAll("-"); - } - } - } - try writer.writeByte('\n'); - - // Only printed if we have hardware stats - if (m.cycles) |cycles| { - const sub_tree_prefix = if (is_last_item) " └─ " else "│ └─ "; - try writer.writeAll(sub_tree_prefix); - try writeColor(writer, .dim, "cycles: "); - try fmtInt(writer, cycles); - } - - if (m.instructions) |instructions| { - try writer.writeAll("\t"); - try writeColor(writer, .dim, "instructions: "); - try fmtInt(writer, instructions); - } - - if (m.ipc) |ipc| { - try writer.writeAll("\t"); - try writeColor(writer, .dim, "ipc: "); - try writer.print("{d:.2}", .{ipc}); - } - - if (m.cache_misses) |cache_missess| { - try writer.writeAll("\t"); - try writeColor(writer, .dim, "miss: "); - try fmtInt(writer, cache_missess); - - try writer.writeByte('\n'); - } - } -} - -/// Prints a formatted summary table to stdout. 
-pub fn report(options: ReportOptions) !void { - var stdout_buffer: [0x2000]u8 = undefined; - var stdout_writer = std.fs.File.stdout().writer(&stdout_buffer); - const stdout = &stdout_writer.interface; - try writeReport(stdout, options); - try stdout.flush(); -} - -//////////////////////////////////////////////////////////////////////////////// -// formatters - -fn fmtInt(writer: *Writer, val: f64) !void { - if (val < 1000) { - try writer.print("{d:.0}", .{val}); - } else if (val < 1_000_000) { - try writer.print("{d:.1}k", .{val / 1000.0}); - } else if (val < 1_000_000_000) { - try writer.print("{d:.1}M", .{val / 1_000_000.0}); - } else { - try writer.print("{d:.1}G", .{val / 1_000_000_000.0}); - } -} - -fn fmtTime(writer: *Writer, ns: f64) !void { - var buf: [64]u8 = undefined; - var slice: []u8 = undefined; - - if (ns < 1000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}ns", .{ns}); - } else if (ns < 1_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}us", .{ns / 1000.0}); - } else if (ns < 1_000_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}ms", .{ns / 1_000_000.0}); - } else { - slice = try std.fmt.bufPrint(&buf, "{d:.2}s", .{ns / 1_000_000_000.0}); - } - try padLeft(writer, slice, 9); -} - -fn fmtOps(writer: *Writer, ops: f64) !void { - var buf: [64]u8 = undefined; - var slice: []u8 = undefined; - - if (ops < 1000) { - slice = try std.fmt.bufPrint(&buf, "{d:.0}/s", .{ops}); - } else if (ops < 1_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}K/s", .{ops / 1000.0}); - } else if (ops < 1_000_000_000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}M/s", .{ops / 1_000_000.0}); - } else { - slice = try std.fmt.bufPrint(&buf, "{d:.2}G/s", .{ops / 1_000_000_000.0}); - } - try padLeft(writer, slice, 11); -} - -fn fmtBandwidth(writer: *Writer, mb: f64) !void { - var buf: [64]u8 = undefined; - var slice: []u8 = undefined; - - if (mb >= 1000) { - slice = try std.fmt.bufPrint(&buf, "{d:.2}GB/s", .{mb / 1000.0}); - } else { - slice = try std.fmt.bufPrint(&buf, "{d:.2}MB/s", .{mb}); - } - try padLeft(writer, slice, 11); -} - -// Pads with spaces on the left (for numbers) -fn padLeft(writer: *Writer, text: []const u8, width: usize) !void { - if (text.len < width) { - _ = try writer.splatByte(' ', width - text.len); - } - try writer.writeAll(text); -} - -// Pads with spaces on the right (for text/comparisons) -fn padRight(writer: *Writer, text: []const u8, width: usize) !void { - try writer.writeAll(text); - if (text.len < width) { - _ = try writer.splatByte(' ', width - text.len); - } -} - -//////////////////////////////////////////////////////////////////////////////// -// tests +pub const Options = Runner.Options; +pub const run = Runner.run; +pub const report = Reporter.report; test { - _ = @import("test.zig"); if (builtin.os.tag == .linux) { - _ = @import("Perf.test.zig"); + _ = @import("perf.test.zig"); } + _ = @import("Runner.test.zig"); + _ = @import("Reporter.test.zig"); }