[DO NOT MERGE] Test accumulate acceleratedkernel#590
[DO NOT MERGE] Test accumulate acceleratedkernel#590 — christiangnrd wants to merge 7 commits into main from
Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/perf/array.jl b/perf/array.jl
index f3ef0adf..e2d2b823 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
- gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n))
+ gpu_mat_ints = MtlMatrix{Int, S}(rand(rng, -10:10, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
@@ -60,13 +60,13 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
let group = addgroup!(group, "accumulate")
let group = addgroup!(group, "Float32")
group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 2)
end
end
@@ -74,26 +74,26 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
let group = addgroup!(group, "reduce")
let group = addgroup!(group, "Float32")
group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 2)
end
end
let group = addgroup!(group, "mapreduce")
let group = addgroup!(group, "Float32")
- group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2)
+ group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec)
+ group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
- group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+ group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec_ints)
+ group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 2)
end
end
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 7af43a09..db6fa99a 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -170,17 +170,17 @@ end
## Base interface
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result)))
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init = AK.neutral_element(op, eltype(result)))
# default behavior unless dims are specified by the user
function Base.accumulate(op, A::WrappedMtlArray;
@@ -199,5 +199,5 @@ function Base.accumulate(op, A::WrappedMtlArray;
else
throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))"))
end
- AK.accumulate!(op, out, A, MetalBackend(); dims, init)
+ return AK.accumulate!(op, out, A, MetalBackend(); dims, init)
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 59e5f8db..a6eb3ab3 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -142,12 +142,16 @@ end
## COV_EXCL_STOP
-Base.mapreduce(f, op, A::WrappedMtlArray;
- dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
- # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
-Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
- dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
- # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+ f, op, A::WrappedMtlArray;
+ dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+ f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
+ dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
# "Borrowed" from GPUArrays
@inline function _init_value(f, op, init, As...)
@@ -164,7 +168,7 @@ end
function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer})
init_val = _init_value(f, op, init, A)
- AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims)
+ return AK.mapreduce(f, op, A; init = init_val, neutral = init_val, dims)
end
_mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing)
_mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init) |
|
Why not implement this at the GPUArrays.jl level? Or is this purely for debugging? |
|
@maleadt purely for debugging. I’ll add do not merge in the title |
|
The goal behind this was to get an idea of what the benchmark results look like, but there seem to be strange failures that I haven't yet determined if they're a Metal issue or an AcceleratedKernels issue. |
a3a50f3 to
98dddc9
Compare
|
Just for benchmarks, this still shouldn't be merged |
69e30e0 to
e38640b
Compare
There was a problem hiding this comment.
Metal Benchmarks
Details
| Benchmark suite | Current: e1935ed | Previous: 4abe75f | Ratio |
|---|---|---|---|
private array/construct |
23500 ns |
26364.666666666668 ns |
0.89 |
private array/broadcast |
460562.5 ns |
466750 ns |
0.99 |
private array/random/randn/Float32 |
935166 ns |
843083 ns |
1.11 |
private array/random/randn!/Float32 |
598083 ns |
633208 ns |
0.94 |
private array/random/rand!/Int64 |
556895.5 ns |
560750 ns |
0.99 |
private array/random/rand!/Float32 |
565833 ns |
594542 ns |
0.95 |
private array/random/rand/Int64 |
865959 ns |
767854 ns |
1.13 |
private array/random/rand/Float32 |
798625 ns |
621500 ns |
1.28 |
private array/copyto!/gpu_to_gpu |
578250 ns |
629625 ns |
0.92 |
private array/copyto!/cpu_to_gpu |
633000 ns |
655708.5 ns |
0.97 |
private array/copyto!/gpu_to_cpu |
666645.5 ns |
821708 ns |
0.81 |
private array/accumulate/Int64/1d |
2000520.5 ns |
||
private array/accumulate/Int64/dims=1 |
2176625 ns |
||
private array/accumulate/Int64/dims=2 |
3040000 ns |
||
private array/accumulate/Float32/1d |
1774916 ns |
||
private array/accumulate/Float32/dims=1 |
1962042 ns |
||
private array/accumulate/Float32/dims=2 |
2351812.5 ns |
||
private array/iteration/findall/int |
1912167 ns |
1848833 ns |
1.03 |
private array/iteration/findall/bool |
1670270.5 ns |
1610708.5 ns |
1.04 |
private array/iteration/findfirst/int |
1830083 ns |
1716416 ns |
1.07 |
private array/iteration/findfirst/bool |
1740187 ns |
1681917 ns |
1.03 |
private array/iteration/scalar |
2474083 ns |
3851541.5 ns |
0.64 |
private array/iteration/logical |
3091209 ns |
3013084 ns |
1.03 |
private array/iteration/findmin/1d |
1897792 ns |
1793417 ns |
1.06 |
private array/iteration/findmin/2d |
1434417 ns |
1369667 ns |
1.05 |
private array/reductions/reduce/Int64/1d |
995500 ns |
||
private array/reductions/reduce/Int64/dims=1 |
836416.5 ns |
||
private array/reductions/reduce/Int64/dims=2 |
712729.5 ns |
||
private array/reductions/reduce/Float32/1d |
1133834 ns |
||
private array/reductions/reduce/Float32/dims=1 |
825375.5 ns |
||
private array/reductions/reduce/Float32/dims=2 |
743125 ns |
||
private array/reductions/mapreduce/Int64/1d |
1002250 ns |
||
private array/reductions/mapreduce/Int64/dims=1 |
833417 ns |
||
private array/reductions/mapreduce/Int64/dims=2 |
700062.5 ns |
||
private array/reductions/mapreduce/Float32/1d |
1124167 ns |
||
private array/reductions/mapreduce/Float32/dims=1 |
813437.5 ns |
||
private array/reductions/mapreduce/Float32/dims=2 |
745354.5 ns |
||
private array/permutedims/4d |
2673084 ns |
2524541 ns |
1.06 |
private array/permutedims/2d |
1090146 ns |
1040583 ns |
1.05 |
private array/permutedims/3d |
1805542 ns |
1604084 ns |
1.13 |
private array/copy |
827292 ns |
627437.5 ns |
1.32 |
latency/precompile |
9972765625 ns |
9812875250 ns |
1.02 |
latency/ttfp |
4152996666 ns |
4071015166 ns |
1.02 |
latency/import |
1306905250 ns |
1279968583 ns |
1.02 |
integration/metaldevrt |
770750 ns |
706083 ns |
1.09 |
integration/byval/slices=1 |
1667250 ns |
1645500 ns |
1.01 |
integration/byval/slices=3 |
19804125 ns |
10475417 ns |
1.89 |
integration/byval/reference |
1664208 ns |
1618417 ns |
1.03 |
integration/byval/slices=2 |
2840000 ns |
2694041.5 ns |
1.05 |
kernel/indexing |
473500 ns |
458250 ns |
1.03 |
kernel/indexing_checked |
467292 ns |
462875 ns |
1.01 |
kernel/launch |
9923.666666666666 ns |
8167 ns |
1.22 |
metal/synchronization/stream |
15042 ns |
14958 ns |
1.01 |
metal/synchronization/context |
16084 ns |
15208 ns |
1.06 |
shared array/construct |
23558.4 ns |
23701.416666666664 ns |
0.99 |
shared array/broadcast |
463166.5 ns |
467292 ns |
0.99 |
shared array/random/randn/Float32 |
911624.5 ns |
789125 ns |
1.16 |
shared array/random/randn!/Float32 |
598208 ns |
634708 ns |
0.94 |
shared array/random/rand!/Int64 |
560417 ns |
561209 ns |
1.00 |
shared array/random/rand!/Float32 |
563000 ns |
599792 ns |
0.94 |
shared array/random/rand/Int64 |
864313 ns |
807979 ns |
1.07 |
shared array/random/rand/Float32 |
833417 ns |
634062.5 ns |
1.31 |
shared array/copyto!/gpu_to_gpu |
83083 ns |
79666 ns |
1.04 |
shared array/copyto!/cpu_to_gpu |
79042 ns |
83792 ns |
0.94 |
shared array/copyto!/gpu_to_cpu |
79167 ns |
83375 ns |
0.95 |
shared array/accumulate/Int64/1d |
2300958 ns |
||
shared array/accumulate/Int64/dims=1 |
2963333 ns |
||
shared array/accumulate/Int64/dims=2 |
3742875 ns |
||
shared array/accumulate/Float32/1d |
1777437.5 ns |
||
shared array/accumulate/Float32/dims=1 |
2137375 ns |
||
shared array/accumulate/Float32/dims=2 |
2496250 ns |
||
shared array/iteration/findall/int |
1954375.5 ns |
1869000 ns |
1.05 |
shared array/iteration/findall/bool |
1695833.5 ns |
1607125 ns |
1.06 |
shared array/iteration/findfirst/int |
1510645.5 ns |
1430125 ns |
1.06 |
shared array/iteration/findfirst/bool |
1456125 ns |
1386250 ns |
1.05 |
shared array/iteration/scalar |
161792 ns |
160291.5 ns |
1.01 |
shared array/iteration/logical |
3178375 ns |
2911209 ns |
1.09 |
shared array/iteration/findmin/1d |
1566958.5 ns |
1472625 ns |
1.06 |
shared array/iteration/findmin/2d |
1448750 ns |
1382417 ns |
1.05 |
shared array/reductions/reduce/Int64/1d |
1006541.5 ns |
||
shared array/reductions/reduce/Int64/dims=1 |
838646 ns |
||
shared array/reductions/reduce/Int64/dims=2 |
717500 ns |
||
shared array/reductions/reduce/Float32/1d |
1122875 ns |
||
shared array/reductions/reduce/Float32/dims=1 |
840500 ns |
||
shared array/reductions/reduce/Float32/dims=2 |
747667 ns |
||
shared array/reductions/mapreduce/Int64/1d |
1021292 ns |
||
shared array/reductions/mapreduce/Int64/dims=1 |
837958 ns |
||
shared array/reductions/mapreduce/Int64/dims=2 |
697709 ns |
||
shared array/reductions/mapreduce/Float32/1d |
1120375 ns |
||
shared array/reductions/mapreduce/Float32/dims=1 |
837291 ns |
||
shared array/reductions/mapreduce/Float32/dims=2 |
757083 ns |
||
shared array/permutedims/4d |
2645646 ns |
2528916 ns |
1.05 |
shared array/permutedims/2d |
1115500 ns |
1043958 ns |
1.07 |
shared array/permutedims/3d |
1823833 ns |
1589500 ns |
1.15 |
shared array/copy |
211333 ns |
247500 ns |
0.85 |
This comment was automatically generated by workflow using github-action-benchmark.
Codecov Report — Attention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #590 +/- ##
==========================================
- Coverage 80.66% 79.61% -1.06%
==========================================
Files 61 61
Lines 2690 2688 -2
==========================================
- Hits 2170 2140 -30
- Misses 520 548 +28
☔ View full report in Codecov by Sentry.
🚀 New features to boost your workflow:
|
9449e39 to
8da0d16
Compare
[only benchmarks]
|
Closing in favour of #625 |
I'm getting errors locally