[DO NOT MERGE] Test accumulate acceleratedkernel#590
[DO NOT MERGE] Test accumulate acceleratedkernel#590 — christiangnrd wants to merge 7 commits into main from
Conversation
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.
diff --git a/perf/array.jl b/perf/array.jl
index f3ef0adf..e2d2b823 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
- gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n))
+ gpu_mat_ints = MtlMatrix{Int, S}(rand(rng, -10:10, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
@@ -60,13 +60,13 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
let group = addgroup!(group, "accumulate")
let group = addgroup!(group, "Float32")
group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims = 2)
end
end
@@ -74,26 +74,26 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
let group = addgroup!(group, "reduce")
let group = addgroup!(group, "Float32")
group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2)
+ group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims = 2)
end
end
let group = addgroup!(group, "mapreduce")
let group = addgroup!(group, "Float32")
- group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
- group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2)
+ group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec)
+ group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat; dims = 2)
end
let group = addgroup!(group, "Int64")
- group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints)
- group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
- group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+ group["1d"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_vec_ints)
+ group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 1)
+ group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x -> x + 1, +, $gpu_mat_ints; dims = 2)
end
end
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 7af43a09..db6fa99a 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -170,17 +170,17 @@ end
## Base interface
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = AK.neutral_element(op, eltype(output)))
Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
- @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
+ @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init = something(init))
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result)))
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init = AK.neutral_element(op, eltype(result)))
# default behavior unless dims are specified by the user
function Base.accumulate(op, A::WrappedMtlArray;
@@ -199,5 +199,5 @@ function Base.accumulate(op, A::WrappedMtlArray;
else
throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))"))
end
- AK.accumulate!(op, out, A, MetalBackend(); dims, init)
+ return AK.accumulate!(op, out, A, MetalBackend(); dims, init)
end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 59e5f8db..a6eb3ab3 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -142,12 +142,16 @@ end
## COV_EXCL_STOP
-Base.mapreduce(f, op, A::WrappedMtlArray;
- dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
- # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
-Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
- dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
- # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+ f, op, A::WrappedMtlArray;
+ dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(
+ f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
+ dims = :, init = nothing
+) = _mapreduce(f, op, A, init, dims)
+# dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
# "Borrowed" from GPUArrays
@inline function _init_value(f, op, init, As...)
@@ -164,7 +168,7 @@ end
function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer})
init_val = _init_value(f, op, init, A)
- AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims)
+ return AK.mapreduce(f, op, A; init = init_val, neutral = init_val, dims)
end
_mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing)
_mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init) |
|
Why not implement this at the GPUArrays.jl level? Or is this purely for debugging? |
|
@maleadt purely for debugging. I’ll add do not merge in the title |
|
The goal behind this was to get an idea of what the benchmark results look like, but there seem to be strange failures that I haven't yet determined if they're a Metal issue or an AcceleratedKernels issue. |
a3a50f3 to
98dddc9
Compare
|
Just for benchmarks, this still shouldn't be merged |
69e30e0 to
e38640b
Compare
There was a problem hiding this comment.
Metal Benchmarks
Details
| Benchmark suite | Current: e1935ed | Previous: 4abe75f | Ratio |
|---|---|---|---|
private array/construct |
23500 ns |
26364.666666666668 ns |
0.89 |
private array/broadcast |
460562.5 ns |
466750 ns |
0.99 |
private array/random/randn/Float32 |
935166 ns |
843083 ns |
1.11 |
private array/random/randn!/Float32 |
598083 ns |
633208 ns |
0.94 |
private array/random/rand!/Int64 |
556895.5 ns |
560750 ns |
0.99 |
private array/random/rand!/Float32 |
565833 ns |
594542 ns |
0.95 |
private array/random/rand/Int64 |
865959 ns |
767854 ns |
1.13 |
private array/random/rand/Float32 |
798625 ns |
621500 ns |
1.28 |
private array/copyto!/gpu_to_gpu |
578250 ns |
629625 ns |
0.92 |
private array/copyto!/cpu_to_gpu |
633000 ns |
655708.5 ns |
0.97 |
private array/copyto!/gpu_to_cpu |
666645.5 ns |
821708 ns |
0.81 |
private array/accumulate/Int64/1d |
2000520.5 ns |
||
private array/accumulate/Int64/dims=1 |
2176625 ns |
||
private array/accumulate/Int64/dims=2 |
3040000 ns |
||
private array/accumulate/Float32/1d |
1774916 ns |
||
private array/accumulate/Float32/dims=1 |
1962042 ns |
||
private array/accumulate/Float32/dims=2 |
2351812.5 ns |
||
private array/iteration/findall/int |
1912167 ns |
1848833 ns |
1.03 |
private array/iteration/findall/bool |
1670270.5 ns |
1610708.5 ns |
1.04 |
private array/iteration/findfirst/int |
1830083 ns |
1716416 ns |
1.07 |
private array/iteration/findfirst/bool |
1740187 ns |
1681917 ns |
1.03 |
private array/iteration/scalar |
2474083 ns |
3851541.5 ns |
0.64 |
private array/iteration/logical |
3091209 ns |
3013084 ns |
1.03 |
private array/iteration/findmin/1d |
1897792 ns |
1793417 ns |
1.06 |
private array/iteration/findmin/2d |
1434417 ns |
1369667 ns |
1.05 |
private array/reductions/reduce/Int64/1d |
995500 ns |
||
private array/reductions/reduce/Int64/dims=1 |
836416.5 ns |
||
private array/reductions/reduce/Int64/dims=2 |
712729.5 ns |
||
private array/reductions/reduce/Float32/1d |
1133834 ns |
||
private array/reductions/reduce/Float32/dims=1 |
825375.5 ns |
||
private array/reductions/reduce/Float32/dims=2 |
743125 ns |
||
private array/reductions/mapreduce/Int64/1d |
1002250 ns |
||
private array/reductions/mapreduce/Int64/dims=1 |
833417 ns |
||
private array/reductions/mapreduce/Int64/dims=2 |
700062.5 ns |
||
private array/reductions/mapreduce/Float32/1d |
1124167 ns |
||
private array/reductions/mapreduce/Float32/dims=1 |
813437.5 ns |
||
private array/reductions/mapreduce/Float32/dims=2 |
745354.5 ns |
||
private array/permutedims/4d |
2673084 ns |
2524541 ns |
1.06 |
private array/permutedims/2d |
1090146 ns |
1040583 ns |
1.05 |
private array/permutedims/3d |
1805542 ns |
1604084 ns |
1.13 |
private array/copy |
827292 ns |
627437.5 ns |
1.32 |
latency/precompile |
9972765625 ns |
9812875250 ns |
1.02 |
latency/ttfp |
4152996666 ns |
4071015166 ns |
1.02 |
latency/import |
1306905250 ns |
1279968583 ns |
1.02 |
integration/metaldevrt |
770750 ns |
706083 ns |
1.09 |
integration/byval/slices=1 |
1667250 ns |
1645500 ns |
1.01 |
integration/byval/slices=3 |
19804125 ns |
10475417 ns |
1.89 |
integration/byval/reference |
1664208 ns |
1618417 ns |
1.03 |
integration/byval/slices=2 |
2840000 ns |
2694041.5 ns |
1.05 |
kernel/indexing |
473500 ns |
458250 ns |
1.03 |
kernel/indexing_checked |
467292 ns |
462875 ns |
1.01 |
kernel/launch |
9923.666666666666 ns |
8167 ns |
1.22 |
metal/synchronization/stream |
15042 ns |
14958 ns |
1.01 |
metal/synchronization/context |
16084 ns |
15208 ns |
1.06 |
shared array/construct |
23558.4 ns |
23701.416666666664 ns |
0.99 |
shared array/broadcast |
463166.5 ns |
467292 ns |
0.99 |
shared array/random/randn/Float32 |
911624.5 ns |
789125 ns |
1.16 |
shared array/random/randn!/Float32 |
598208 ns |
634708 ns |
0.94 |
shared array/random/rand!/Int64 |
560417 ns |
561209 ns |
1.00 |
shared array/random/rand!/Float32 |
563000 ns |
599792 ns |
0.94 |
shared array/random/rand/Int64 |
864313 ns |
807979 ns |
1.07 |
shared array/random/rand/Float32 |
833417 ns |
634062.5 ns |
1.31 |
shared array/copyto!/gpu_to_gpu |
83083 ns |
79666 ns |
1.04 |
shared array/copyto!/cpu_to_gpu |
79042 ns |
83792 ns |
0.94 |
shared array/copyto!/gpu_to_cpu |
79167 ns |
83375 ns |
0.95 |
shared array/accumulate/Int64/1d |
2300958 ns |
||
shared array/accumulate/Int64/dims=1 |
2963333 ns |
||
shared array/accumulate/Int64/dims=2 |
3742875 ns |
||
shared array/accumulate/Float32/1d |
1777437.5 ns |
||
shared array/accumulate/Float32/dims=1 |
2137375 ns |
||
shared array/accumulate/Float32/dims=2 |
2496250 ns |
||
shared array/iteration/findall/int |
1954375.5 ns |
1869000 ns |
1.05 |
shared array/iteration/findall/bool |
1695833.5 ns |
1607125 ns |
1.06 |
shared array/iteration/findfirst/int |
1510645.5 ns |
1430125 ns |
1.06 |
shared array/iteration/findfirst/bool |
1456125 ns |
1386250 ns |
1.05 |
shared array/iteration/scalar |
161792 ns |
160291.5 ns |
1.01 |
shared array/iteration/logical |
3178375 ns |
2911209 ns |
1.09 |
shared array/iteration/findmin/1d |
1566958.5 ns |
1472625 ns |
1.06 |
shared array/iteration/findmin/2d |
1448750 ns |
1382417 ns |
1.05 |
shared array/reductions/reduce/Int64/1d |
1006541.5 ns |
||
shared array/reductions/reduce/Int64/dims=1 |
838646 ns |
||
shared array/reductions/reduce/Int64/dims=2 |
717500 ns |
||
shared array/reductions/reduce/Float32/1d |
1122875 ns |
||
shared array/reductions/reduce/Float32/dims=1 |
840500 ns |
||
shared array/reductions/reduce/Float32/dims=2 |
747667 ns |
||
shared array/reductions/mapreduce/Int64/1d |
1021292 ns |
||
shared array/reductions/mapreduce/Int64/dims=1 |
837958 ns |
||
shared array/reductions/mapreduce/Int64/dims=2 |
697709 ns |
||
shared array/reductions/mapreduce/Float32/1d |
1120375 ns |
||
shared array/reductions/mapreduce/Float32/dims=1 |
837291 ns |
||
shared array/reductions/mapreduce/Float32/dims=2 |
757083 ns |
||
shared array/permutedims/4d |
2645646 ns |
2528916 ns |
1.05 |
shared array/permutedims/2d |
1115500 ns |
1043958 ns |
1.07 |
shared array/permutedims/3d |
1823833 ns |
1589500 ns |
1.15 |
shared array/copy |
211333 ns |
247500 ns |
0.85 |
This comment was automatically generated by workflow using github-action-benchmark.
Codecov Report — Attention: Patch coverage is
Additional details and impacted files@@ Coverage Diff @@
## main #590 +/- ##
==========================================
- Coverage 80.66% 79.61% -1.06%
==========================================
Files 61 61
Lines 2690 2688 -2
==========================================
- Hits 2170 2140 -30
- Misses 520 548 +28
☔ View full report in Codecov by Sentry.
🚀 New features to boost your workflow:
|
9449e39 to
8da0d16
Compare
[only benchmarks]
|
Closing in favour of #625 |
I'm getting errors locally