diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 2d69ec0e5..bbeb3aa9c 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -138,6 +138,7 @@ steps:
 
               println("--- :julia: Instantiating project")
               Pkg.develop([PackageSpec(path=pwd())])
+              Pkg.add(url="https://github.com/JuliaGPU/AcceleratedKernels.jl", rev="main")
 
               println("+++ :julia: Benchmarking")
               include("perf/runbenchmarks.jl")'
diff --git a/Project.toml b/Project.toml
index 6129ad5df..7038e1edd 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,6 +3,7 @@ uuid = "dde4c033-4e86-420c-a63e-0dd931031962"
 version = "1.6.2"
 
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
@@ -32,6 +33,7 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 SpecialFunctionsExt = "SpecialFunctions"
 
 [compat]
+AcceleratedKernels = "0.4"
 Adapt = "4"
 BFloat16s = "0.5"
 CEnum = "0.4, 0.5"
diff --git a/perf/array.jl b/perf/array.jl
index 954014ccc..f3ef0adf2 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -10,7 +10,7 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
     gpu_vec = reshape(gpu_mat, length(gpu_mat))
     gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
     gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n))
+    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, -10:10, m, n))
     gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
     gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
     gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
@@ -58,19 +58,43 @@ for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shar
 
    # no need to test inplace version, which performs the same operation (but with an alloc)
    let group = addgroup!(group, "accumulate")
-        group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
-        group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
+        let group = addgroup!(group, "Float32")
+            group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
+            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
+            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=2)
+        end
+        let group = addgroup!(group, "Int64")
+            group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec_ints)
+            group["dims=1"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=1)
+            group["dims=2"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat_ints; dims=2)
+        end
    end

    let group = addgroup!(group, "reductions")
        let group = addgroup!(group, "reduce")
-            group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
-            group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+            let group = addgroup!(group, "Float32")
+                group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
+                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=2)
+            end
+            let group = addgroup!(group, "Int64")
+                group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec_ints)
+                group["dims=1"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat_ints; dims=2)
+            end
        end

        let group = addgroup!(group, "mapreduce")
-            group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
-            group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+            let group = addgroup!(group, "Float32")
+                group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
+                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=2)
+            end
+            let group = addgroup!(group, "Int64")
+                group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec_ints)
+                group["dims=1"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=1)
+                group["dims=2"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat_ints; dims=2)
+            end
        end

        # used by sum, prod, minimum, maximum, all, any, count
diff --git a/src/Metal.jl b/src/Metal.jl
index b6c974588..b9bdad0e1 100644
--- a/src/Metal.jl
+++ b/src/Metal.jl
@@ -12,6 +12,7 @@ using ExprTools: splitdef, combinedef
 using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
 import ObjectiveC: is_macos, darwin_version, macos_version
 import KernelAbstractions
+import AcceleratedKernels as AK
 using ScopedValues
 
 include("version.jl")
diff --git a/src/accumulate.jl b/src/accumulate.jl
index 31e2dc4fe..7af43a09f 100644
--- a/src/accumulate.jl
+++ b/src/accumulate.jl
@@ -170,33 +170,34 @@ end
 ## Base interface
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlVector, dims::Nothing, init::Nothing) =
-    scan!(op, output, input; dims=1)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Nothing) =
-    scan!(op, output, input; dims=dims)
-
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=AK.neutral_element(op, eltype(output)))
 Base._accumulate!(op, output::WrappedMtlArray, input::MtlVector, dims::Nothing, init::Some) =
-    scan!(op, output, input; dims=1, init=init)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
 
 Base._accumulate!(op, output::WrappedMtlArray, input::WrappedMtlArray, dims::Integer, init::Some) =
-    scan!(op, output, input; dims=dims, init=init)
+    @inline AK.accumulate!(op, output, input, MetalBackend(); dims, init=something(init))
 
-Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = accumulate!(op, result, v)
+Base.accumulate_pairwise!(op, result::WrappedMtlVector, v::WrappedMtlVector) = @inline AK.accumulate!(op, result, v, MetalBackend(); init=AK.neutral_element(op, eltype(result)))
 
 # default behavior unless dims are specified by the user
 function Base.accumulate(op, A::WrappedMtlArray;
                          dims::Union{Nothing,Integer}=nothing,
                          kw...)
+    nt = values(kw)
     if dims === nothing && !(A isa AbstractVector)
         # This branch takes care of the cases not handled by `_accumulate!`.
-        return reshape(accumulate(op, A[:]; kw...), size(A))
+        return reshape(AK.accumulate(op, A[:], MetalBackend(); init = (:init in keys(kw) ? nt.init : AK.neutral_element(op, eltype(A)))), size(A))
     end
-    nt = values(kw)
     if isempty(kw)
         out = similar(A, Base.promote_op(op, eltype(A), eltype(A)))
+        init = AK.neutral_element(op, eltype(out))
     elseif keys(nt) === (:init,)
         out = similar(A, Base.promote_op(op, typeof(nt.init), eltype(A)))
+        init = nt.init
     else
         throw(ArgumentError("accumulate does not support the keyword arguments $(setdiff(keys(nt), (:init,)))"))
     end
-    accumulate!(op, out, A; dims=dims, kw...)
+    AK.accumulate!(op, out, A, MetalBackend(); dims, init)
 end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index 8a353e3c2..59e5f8dbd 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -142,6 +142,33 @@ end
 
 ## COV_EXCL_STOP
 
+Base.mapreduce(f, op, A::WrappedMtlArray;
+               dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
+               # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+Base.mapreduce(f, op, A::Broadcast.Broadcasted{<:MtlArrayStyle};
+               dims=:, init=nothing) = _mapreduce(f, op, A, init, dims)
+               # dims=:, init=nothing) = AK.mapreduce(f, op, A, init, dims=dims isa Colon ? nothing : dims)
+
+# "Borrowed" from GPUArrays
+@inline function _init_value(f, op, init, As...)
+    if init === nothing
+        ET = Broadcast.combine_eltypes(f, As)
+        ET = Base.promote_op(op, ET, ET)
+        (ET === Union{} || ET === Any) &&
+            error("mapreduce cannot figure the output element type, please pass an explicit init value")
+
+        init = AK.neutral_element(op, ET)
+    end
+    return init
+end
+
+function _mapreduce(f, op, A, init, dims::Union{Nothing, Integer})
+    init_val = _init_value(f, op, init, A)
+    AK.mapreduce(f, op, A; init=init_val, neutral=init_val, dims)
+end
+_mapreduce(f, op, A, init, ::Colon) = _mapreduce(f, op, A, init, nothing)
+_mapreduce(f, op, A, init, dims) = GPUArrays._mapreduce(f, op, A; dims, init)
+
 function GPUArrays.mapreducedim!(f::F, op::OP, R::WrappedMtlArray{T},
                                  A::Union{AbstractArray,Broadcast.Broadcasted};
                                  init=nothing) where {F, OP, T}
diff --git a/test/Project.toml b/test/Project.toml
index b64b414d6..a511d64fb 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
@@ -11,6 +12,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/test/runtests.jl b/test/runtests.jl
index b46c4ee71..0fdaaa134 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,3 +1,5 @@
+using Pkg
+Pkg.develop("AcceleratedKernels")
 using Distributed
 using Dates
 using Metal
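--
Usage sketch (not part of the patch): a minimal illustration of the dispatch paths the diff above wires up, where `accumulate` on `MtlArray`s now lowers to `AK.accumulate!` and `mapreduce` routes through the new `_mapreduce` helpers to `AK.mapreduce`. The variable names and sizes below are arbitrary, and running it assumes macOS with Metal.jl plus the AcceleratedKernels.jl revision installed as above.

    using Metal

    v = MtlArray(rand(Float32, 1024))       # MtlVector{Float32}
    m = MtlArray(rand(-10:10, 256, 256))    # MtlMatrix{Int64}, mirroring the new benchmark inputs

    # accumulate: Base._accumulate! forwards to AK.accumulate!, with init
    # defaulting to AK.neutral_element(op, eltype(output))
    accumulate(+, v)                # 1d prefix sum
    accumulate(+, m; dims=2)        # row-wise scan, the new dims=2 benchmark case
    accumulate(+, v; init=1.0f0)    # an explicit init is forwarded instead of the neutral element

    # mapreduce: the new Base.mapreduce methods call _mapreduce, which normalizes
    # Colon dims to nothing and passes Integer dims through to AK.mapreduce
    mapreduce(x -> x + 1, +, v)     # whole-array reduction
    mapreduce(abs, max, m; dims=1)  # per-column reduction; _init_value derives typemin(Int64)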