From 661ccf5687f0c1eb85eb9a24f90a7e75b06a6d87 Mon Sep 17 00:00:00 2001 From: Yat Long Poon Date: Fri, 10 Oct 2025 10:58:04 +0100 Subject: [PATCH] Add VectorMax and Clamp to SVE microbenchmark --- src/benchmarks/micro/sve/Clamp.cs | 128 +++++++++++++++++++ src/benchmarks/micro/sve/VectorMax.cs | 172 ++++++++++++++++++++++++++ 2 files changed, 300 insertions(+) create mode 100644 src/benchmarks/micro/sve/Clamp.cs create mode 100644 src/benchmarks/micro/sve/VectorMax.cs diff --git a/src/benchmarks/micro/sve/Clamp.cs b/src/benchmarks/micro/sve/Clamp.cs new file mode 100644 index 00000000000..ac583402227 --- /dev/null +++ b/src/benchmarks/micro/sve/Clamp.cs @@ -0,0 +1,128 @@ +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Extensions; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Filters; +using MicroBenchmarks; + +namespace SveBenchmarks +{ + [BenchmarkCategory(Categories.Runtime)] + [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)] + [Config(typeof(Config))] + public class Clamp + { + private class Config : ManualConfig + { + public Config() + { + AddFilter(new SimpleFilter(_ => Sve.IsSupported)); + } + } + + [Params(15, 127, 527, 10015)] + public int Size; + + private int _output; + + [GlobalCleanup] + public virtual void Verify() + { + int current = _output; + Scalar(); + int scalar = _output; + // Check that the result is the same as the scalar result. + Debug.Assert(current == scalar); + } + + // The following algorithms are adapted from the Arm simd-loops repository: + // https://gitlab.arm.com/architecture/simd-loops/-/blob/main/loops/loop_040.c + + [Benchmark] + public unsafe void Scalar() + { + int res = 0; + int val = Size / 2; + for (int i = 0; i < Size; i++) + { + res += Math.Min(Math.Max(val, i), 2 * i); + } + _output = res; + } + + [Benchmark] + public unsafe void Vector128Clamp() + { + int i = 0; + int lmt = Size - Size % 4; + int val = Size / 2; + + Vector128 resVec = Vector128.Zero; + Vector128 valVec = Vector128.Create(val); + Vector128 minVec = Vector128.Create(0, 1, 2, 3); + for (; i < lmt; i += 4) + { + Vector128 maxVec = AdvSimd.ShiftLeftLogical(minVec.AsUInt32(), 1).AsInt32(); + Vector128 tmpVec = AdvSimd.Min(AdvSimd.Max(valVec, minVec), maxVec); + resVec = AdvSimd.Add(resVec, tmpVec); + minVec = AdvSimd.Add(minVec, Vector128.Create(4)); + } + int res = (int)AdvSimd.Arm64.AddAcross(resVec).ToScalar(); + for (; i < Size; i++) + { + res += Math.Min(Math.Max(val, i), 2 * i); + } + _output = res; + } + + [Benchmark] + public unsafe void SveClamp() + { + int i = 0; + int length = Size; + int cntw = (int)Sve.Count32BitElements(); + + Vector resVec = Vector.Zero; + Vector valVec = new Vector(Size / 2); + Vector minVec = Vector.Indices; + Vector pTrue = Sve.CreateTrueMaskInt32(); + Vector pLoop = (Vector)Sve.CreateWhileLessThanMask32Bit(i, length); + while (Sve.TestFirstTrue(pTrue, pLoop)) + { + Vector maxVec = Sve.ShiftLeftLogical(minVec, Vector.One); + Vector tmpVec = Sve.Min(Sve.Max(valVec, minVec), maxVec); + resVec = Sve.ConditionalSelect(pLoop, Sve.Add(resVec, tmpVec), resVec); + minVec = Sve.Add(minVec, new Vector(cntw)); + + i += cntw; + pLoop = (Vector)Sve.CreateWhileLessThanMask32Bit(i, length); + } + _output = (int)Sve.AddAcross(resVec).ToScalar(); + } + + [Benchmark] + public unsafe void SveTail() + { + int i = 0; + int length = Size; + int cntw = (int)Sve.Count32BitElements(); + + Vector resVec = Vector.Zero; + Vector valVec = new Vector(Size / 2); + Vector minVec = Vector.Indices; + for (; i < length; i += cntw) + { + Vector pLoop = (Vector)Sve.CreateWhileLessThanMask32Bit(i, length); + Vector maxVec = Sve.ShiftLeftLogical(minVec, Vector.One); + Vector tmpVec = Sve.Min(Sve.Max(valVec, minVec), maxVec); + resVec = Sve.ConditionalSelect(pLoop, Sve.Add(resVec, tmpVec), resVec); + minVec = Sve.Add(minVec, new Vector(cntw)); + } + _output = (int)Sve.AddAcross(resVec).ToScalar(); + } + } +} diff --git a/src/benchmarks/micro/sve/VectorMax.cs b/src/benchmarks/micro/sve/VectorMax.cs new file mode 100644 index 00000000000..3a57554305b --- /dev/null +++ b/src/benchmarks/micro/sve/VectorMax.cs @@ -0,0 +1,172 @@ +using System; +using System.Diagnostics; +using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.Arm; +using BenchmarkDotNet.Attributes; +using BenchmarkDotNet.Extensions; +using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Filters; +using MicroBenchmarks; + +namespace SveBenchmarks +{ + [BenchmarkCategory(Categories.Runtime)] + [OperatingSystemsArchitectureFilter(allowed: true, System.Runtime.InteropServices.Architecture.Arm64)] + [Config(typeof(Config))] + public class VectorMax + { + private class Config : ManualConfig + { + public Config() + { + AddFilter(new SimpleFilter(_ => Sve.IsSupported)); + } + } + + [Params(15, 127, 527, 10015)] + public short Size; + + private short[] _input; + private uint _output; + + [GlobalSetup] + public virtual void Setup() + { + _input = ValuesGenerator.Array(Size); + } + + [GlobalCleanup] + public virtual void Verify() + { + uint current = _output; + Setup(); + Scalar(); + uint scalar = _output; + // Check that the result is the same as the scalar result. + Debug.Assert(current == scalar); + } + + // The following algorithms are adapted from Arm "SVE Programming Examples": + // https://developer.arm.com/documentation/dai0548/latest/ (example B1) + + [Benchmark] + public unsafe void Scalar() + { + fixed (short* input = _input) + { + short maxVal = input[0]; + short maxIdx = 0; + for (short i = 0; i < Size; i++) + { + if (input[i] > maxVal) + { + maxVal = input[i]; + maxIdx = i; + } + } + // Combine max value and index into a 32-bit integer. + _output = (uint)maxVal << 16 ^ (uint)maxIdx; + } + } + + [Benchmark] + public unsafe void Vector128VectorMax() + { + fixed (short* input = _input) + { + short i = 0; + short lmt = (short)(Size - Size % 8); + + Vector128 idxVec = Vector128.Create(0, 1, 2, 3, 4, 5, 6, 7); + + // Initialize the first vector worth of values. + Vector128 maxVec = Vector128.Load(input); + Vector128 maxIdxVec = idxVec; + + i += 8; + for (; i < lmt; i += 8) + { + Vector128 val = Vector128.Load(input + i); + idxVec = AdvSimd.Add(idxVec, Vector128.Create((short)8)); + // Find indices of the new maximum values. + Vector128 cmp = AdvSimd.CompareGreaterThan(val, maxVec); + // Update maximum values. + maxVec = AdvSimd.Max(maxVec, val); + // Update the indices with the maximum values. + maxIdxVec = AdvSimd.BitwiseSelect(cmp, idxVec, maxIdxVec); + } + + // Get the maximum element across the max vector. + short maxVal = AdvSimd.Arm64.MaxAcross(maxVec).ToScalar(); + + // Find the first occurence (min index) of the max value. + Vector128 cmpIndex = AdvSimd.CompareEqual(maxVec, Vector128.Create(maxVal)); + maxIdxVec = AdvSimd.BitwiseSelect(cmpIndex, maxIdxVec, Vector128.Create((short)-1)); + short maxIdx = (short)AdvSimd.Arm64.MinAcross(maxIdxVec.AsUInt16()).ToScalar(); + + // Search in remaining elements. + for (; i < Size; i++) + { + if (input[i] > maxVal) + { + maxVal = input[i]; + maxIdx = i; + } + } + + // Combine max value and index into a 32-bit integer. + _output = (uint)maxVal << 16 ^ (uint)maxIdx; + } + } + + [Benchmark] + public unsafe void SveVectorMax() + { + fixed (short* input = _input) + { + short i = 0; + short cnth = (short)Sve.Count16BitElements(); + + Vector pTrue = Sve.CreateTrueMaskInt16(); + Vector pLoop = (Vector)Sve.CreateWhileLessThanMask16Bit(0, Size); + Vector idxVec = Vector.Indices; + + // Initialize the first vector worth of values. + Vector maxVec = Sve.LoadVector(pLoop, input); + Vector maxIdxVec = idxVec; + + i += cnth; + pLoop = (Vector)Sve.CreateWhileLessThanMask16Bit(i, Size); + while (Sve.TestFirstTrue(pTrue, pLoop)) + { + Vector val = Sve.LoadVector(pLoop, input + i); + // Increment indices counter. + idxVec = Sve.Add(idxVec, new Vector(cnth)); + // Find indices of the new maximum values. + Vector cmp = Sve.CompareGreaterThan(val, maxVec); + // Update maximum values. + maxVec = Sve.Max(maxVec, val); + // Update the indices with the maximum values. + maxIdxVec = Sve.ConditionalSelect(cmp, idxVec, maxIdxVec); + + // Handle loop. + i += cnth; + pLoop = (Vector)Sve.CreateWhileLessThanMask16Bit(i, Size); + } + + // Get the maximum element across the max vector. + short maxVal = Sve.MaxAcross(maxVec).ToScalar(); + + // Find the first occurence (min index) of the max value. + Vector pIndex = Sve.CompareEqual(maxVec, new Vector(maxVal)); + maxIdxVec = Sve.ConditionalSelect(pIndex, maxIdxVec, new Vector(-1)); + short maxIdx = (short)Sve.MinAcross((Vector)maxIdxVec).ToScalar(); + + // Combine max value and index into a 32-bit integer. + _output = (uint)maxVal << 16 ^ (uint)maxIdx; + } + } + + } +}