diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index d25fd5fba..5984f88e0 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -13,7 +13,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing CUDA")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")'
+                Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -76,7 +76,7 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing Metal")
                 using Pkg
-                Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")'
+                Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -106,10 +106,11 @@ steps:
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
-      julia -e 'println("--- :julia: Developing oneAPI")
+      julia --compiled-modules=no -e 'println("--- :julia: Developing oneAPI")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
-                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
+                Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
+                Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
+                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
                 Pkg.develop(; path=pwd())' || exit 3
@@ -143,7 +144,7 @@ steps:
                 Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
       julia -e '
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
+                Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew")
                 println("--- :julia: Instantiating project")
                 Pkg.develop(; path=pwd())' || exit 3
 
@@ -174,11 +175,11 @@ steps:
     command: |
       julia -e 'println("--- :julia: Developing OpenCL")
                 using Pkg
-                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics")
-                Pkg.develop(; name="SPIRVIntrinsics")'
+                Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")'
       julia -e 'println("--- :julia: Instantiating project")
                 using Pkg
-                Pkg.develop(; path=pwd())' || exit 3
+                Pkg.develop(; path=pwd())
+                Pkg.develop(; name="SPIRVIntrinsics")' || exit 3
 
       julia -e 'println("+++ :julia: Running tests")
                 using Pkg
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c98ffeb2d..d4807efa0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -128,6 +128,15 @@ jobs:
             end
           end
         '
+      - name: "Develop SPIRVIntrinsics"
+        run: |
+          julia -e '
+            using Pkg
+            withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
+              Pkg.activate(".")
+              Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")
+            end
+          '
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         if: runner.os != 'Windows'
diff --git a/Project.toml b/Project.toml
index 7e22dd203..0b5235dc2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "KernelAbstractions"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 authors = ["Valentin Churavy and contributors"]
-version = "0.10.0-dev"
+version = "0.10.1-dev"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
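A note on the `Pkg.add` form used in the CI changes above: SPIRVIntrinsics lives in a subdirectory of the OpenCL.jl repository, so the pinned branch is added via `url`/`rev`/`subdir`/`name`. The same recipe works for a local development environment; a minimal sketch (the `withenv` guard simply defers precompilation, mirroring the "Develop SPIRVIntrinsics" CI step):

```julia
using Pkg

# Defer precompilation until all pinned dev dependencies are in place,
# as the CI step above does.
withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
    Pkg.activate(".")
    # SPIRVIntrinsics is a sub-package of the OpenCL.jl repository, hence
    # the explicit subdirectory and package name.
    Pkg.add(url = "https://github.com/christiangnrd/OpenCL.jl", rev = "intrinsicsnew",
        subdir = "lib/intrinsics", name = "SPIRVIntrinsics")
end
```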
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
index 79efa4cbe..5bc96f401 100644
--- a/src/intrinsics.jl
+++ b/src/intrinsics.jl
@@ -102,6 +102,78 @@ Returns the unique group ID.
 """
 function get_group_id end
 
+"""
+    get_sub_group_size()::UInt32
+
+Returns the number of work-items in the sub-group.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_size()::UInt32
+    ```
+"""
+function get_sub_group_size end
+
+"""
+    get_max_sub_group_size()::UInt32
+
+Returns the maximum sub-group size for sub-groups in the current workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_max_sub_group_size()::UInt32
+    ```
+"""
+function get_max_sub_group_size end
+
+"""
+    get_num_sub_groups()::UInt32
+
+Returns the number of sub-groups in the current workgroup.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_num_sub_groups()::UInt32
+    ```
+"""
+function get_num_sub_groups end
+
+"""
+    get_sub_group_id()::UInt32
+
+Returns the sub-group ID within the workgroup.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_id()::UInt32
+    ```
+"""
+function get_sub_group_id end
+
+"""
+    get_sub_group_local_id()::UInt32
+
+Returns the work-item ID within the current sub-group.
+
+!!! note
+    1-based.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override get_sub_group_local_id()::UInt32
+    ```
+"""
+function get_sub_group_local_id end
+
+
 """
     localmemory(::Type{T}, dims)
 
@@ -116,6 +188,34 @@ Declare memory that is local to a workgroup.
 """
 localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
 
+"""
+    shfl_down(val::T, offset::Integer) where T
+
+Read `val` from the lane whose ID is `offset` greater than that
+of the calling work-item. Kernels using this function should
+assume that it performs no synchronization.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override shfl_down(val::T, offset::Integer) where T
+    ```
+    As well as the on-device functionality.
+"""
+function shfl_down end
+
+"""
+    shfl_down_types(::Backend)::Vector{DataType}
+
+Returns a vector of the `DataType`s for which `backend` supports `shfl_down`.
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they support `shfl_down` for any types.
+"""
+shfl_down_types(::Backend) = DataType[]
+
+
 """
     barrier()
 
@@ -139,6 +239,29 @@
 function barrier()
     error("Group barrier used outside kernel or not captured")
 end
+"""
+    sub_group_barrier()
+
+After a `sub_group_barrier()` call, all reads and writes to global and local memory
+from each work-item in the sub-group are visible to all other work-items in the
+sub-group.
+
+This does **not** guarantee that a write from a work-item in one sub-group will
+be visible to a work-item in a different sub-group.
+
+!!! note
+    `sub_group_barrier()` must be encountered by all work-items of a sub-group executing the kernel or by none at all.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override sub_group_barrier()
+    ```
+"""
+function sub_group_barrier()
+    error("Sub-group barrier used outside kernel or not captured")
+end
+
 
 """
     _print(args...)
@@ -220,6 +343,21 @@ kernel launch with too big a workgroup is attempted.
 """
 function max_work_group_size end
 
+"""
+    sub_group_size(backend)::Int
+
+Returns a reasonable sub-group size supported by the currently
+active device for the specified backend. This is typically
+32, or 64 on devices that don't support 32.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    sub_group_size(backend::NewBackend)::Int
+    ```
+"""
+function sub_group_size end
+
 """
     multiprocessor_count(backend::NewBackend)::Int
 
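To make the new API concrete, here is a minimal device-side sketch of a sub-group sum built from the intrinsics documented above. It is illustrative only, not part of the patch: `subgroup_sum_kernel` and its array arguments are hypothetical, and the named-tuple `.x` accessors follow the test kernel added later in this PR, as does the ascending-offset reduction loop.

```julia
import KernelAbstractions.KernelIntrinsics as KI

# One partial sum per sub-group: lane 1 of each sub-group accumulates the
# values held by that sub-group's work-items and writes the result out.
function subgroup_sum_kernel(values, partials)
    i = KI.get_global_id().x
    lane = KI.get_sub_group_local_id()   # 1-based lane within the sub-group
    sg = KI.get_sub_group_id()           # 1-based sub-group within the workgroup

    val = i <= length(values) ? values[i] : zero(eltype(values))

    # Ascending-offset tree reduction: after log2(sub-group size) steps,
    # lane 1 holds the sum of the whole sub-group (power-of-two sizes).
    offset = UInt32(1)
    while offset < KI.get_sub_group_size()
        val += KI.shfl_down(val, offset)
        offset <<= 1
    end

    KI.sub_group_barrier()

    if lane == 1
        partials[(KI.get_group_id().x - 1) * KI.get_num_sub_groups() + sg] = val
    end
    return
end
```

Launched the way the tests below launch their kernels (`KI.@kernel backend() workgroupsize = ... subgroup_sum_kernel(a, b)`), `partials` ends up with one entry per sub-group.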
diff --git a/src/pocl/backend.jl b/src/pocl/backend.jl
index f23e20b0f..d3b4fd868 100644
--- a/src/pocl/backend.jl
+++ b/src/pocl/backend.jl
@@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction
 import KernelAbstractions as KA
 import KernelAbstractions.KernelIntrinsics as KI
 
+import SPIRVIntrinsics
+
 import StaticArrays
 
 import Adapt
@@ -174,10 +176,36 @@ end
 function KI.max_work_group_size(::POCLBackend)::Int
     return Int(device().max_work_group_size)
 end
+function KI.sub_group_size(::POCLBackend)::Int
+    sg_sizes = cl.device().sub_group_sizes
+    if 32 in sg_sizes
+        return 32
+    elseif 64 in sg_sizes
+        return 64
+    elseif 16 in sg_sizes
+        return 16
+    else
+        return 1
+    end
+end
 function KI.multiprocessor_count(::POCLBackend)::Int
     return Int(device().max_compute_units)
 end
 
+function KI.shfl_down_types(::POCLBackend)
+    res = copy(SPIRVIntrinsics.gentypes)
+
+    backend_extensions = cl.device().extensions
+    if "cl_khr_fp64" ∉ backend_extensions
+        res = setdiff(res, [Float64])
+    end
+    if "cl_khr_fp16" ∉ backend_extensions
+        res = setdiff(res, [Float16])
+    end
+
+    return res
+end
+
 ## Indexing Functions
 
 @device_override @inline function KI.get_local_id()
@@ -204,6 +232,16 @@ end
     return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
 end
 
+@device_override KI.get_sub_group_size() = get_sub_group_size()
+
+@device_override KI.get_max_sub_group_size() = get_max_sub_group_size()
+
+@device_override KI.get_num_sub_groups() = get_num_sub_groups()
+
+@device_override KI.get_sub_group_id() = get_sub_group_id()
+
+@device_override KI.get_sub_group_local_id() = get_sub_group_local_id()
+
 @device_override @inline function KA.__validindex(ctx)
     if KA.__dynamic_checkbounds(ctx)
         I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@@ -232,6 +270,14 @@ end
     work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
 end
 
+@device_override @inline function KI.sub_group_barrier()
+    sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
+end
+
+@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
+    sub_group_shuffle(val, get_sub_group_local_id() + offset)
+end
+
 @device_override @inline function KI._print(args...)
     POCL._print(args...)
 end
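Host-side, the two new queries compose as follows; a usage sketch assuming an active `POCLBackend` (the selection and filtering behavior is exactly the code above, everything else is illustrative):

```julia
import KernelAbstractions.KernelIntrinsics as KI

backend = POCLBackend()

# Preferred width reported by the device: 32 if available, then 64,
# then 16, falling back to 1 (see KI.sub_group_size above).
sg = KI.sub_group_size(backend)

# Element types for which shfl_down is usable on this device; Float64 and
# Float16 are filtered out above when the corresponding cl_khr_fp64 /
# cl_khr_fp16 extension is missing.
for T in KI.shfl_down_types(backend)
    println("shfl_down available for ", T)
end

# Sizing workgroups as a multiple of the sub-group width keeps sub-groups
# full, so get_sub_group_size matches get_max_sub_group_size on-device.
workgroupsize = 4 * sg
```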
diff --git a/src/pocl/compiler/compilation.jl b/src/pocl/compiler/compilation.jl
index 3d5930d40..c277bed7d 100644
--- a/src/pocl/compiler/compilation.jl
+++ b/src/pocl/compiler/compilation.jl
@@ -1,6 +1,14 @@
 ## gpucompiler interface
 
-struct OpenCLCompilerParams <: AbstractCompilerParams end
+Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
+    sub_group_size::Int
+end
+function Base.hash(params::OpenCLCompilerParams, h::UInt)
+    h = hash(params.sub_group_size, h)
+
+    return h
+end
+
 const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
 const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams}
 
@@ -19,7 +27,21 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
     in(fn, known_intrinsics) ||
     contains(fn, "__spirv_")
 
+function GPUCompiler.finish_module!(
+        @nospecialize(job::OpenCLCompilerJob),
+        mod::LLVM.Module, entry::LLVM.Function
+    )
+    entry = invoke(
+        GPUCompiler.finish_module!,
+        Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
+        job, mod, entry
+    )
+    # Set the subgroup size
+    metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])
+
+    return entry
+end
 
 ## compiler implementation (cache, configure, compile, and link)
 
 # cache of compilation caches, per context
@@ -45,14 +67,17 @@ function compiler_config(dev::cl.Device; kwargs...)
         end
         return config
     end
 end
 
-@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...)
+@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, sub_group_size = 32, kwargs...)
     supports_fp16 = "cl_khr_fp16" in dev.extensions
     supports_fp64 = "cl_khr_fp64" in dev.extensions
+    if sub_group_size ∉ dev.sub_group_sizes
+        @error("$sub_group_size is not a valid sub-group size for this device.")
+    end
 
     # create GPUCompiler objects
     target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
-    params = OpenCLCompilerParams()
+    params = OpenCLCompilerParams(; sub_group_size)
 
     return CompilerConfig(target, params; kernel, name, always_inline)
 end
diff --git a/src/pocl/compiler/execution.jl b/src/pocl/compiler/execution.jl
index dc47cb302..6c47952ee 100644
--- a/src/pocl/compiler/execution.jl
+++ b/src/pocl/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction, clconvert
 ## high-level @opencl interface
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:kernel, :name, :always_inline]
+const COMPILER_KWARGS = [:kernel, :name, :always_inline, :sub_group_size]
 const LAUNCH_KWARGS = [:global_size, :local_size, :queue]
 
 macro opencl(ex...)
diff --git a/src/pocl/nanoOpenCL.jl b/src/pocl/nanoOpenCL.jl
index a706710d1..82ec7d281 100644
--- a/src/pocl/nanoOpenCL.jl
+++ b/src/pocl/nanoOpenCL.jl
@@ -390,6 +390,8 @@ const CL_KERNEL_EXEC_INFO_SVM_PTRS = 0x11b6
 
 const CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM = 0x11b7
 
+const CL_DEVICE_SUB_GROUP_SIZES_INTEL = 0x4108
+
 struct CLError <: Exception
     code::Cint
 end
@@ -935,6 +937,14 @@ devices(p::Platform) = devices(p, CL_DEVICE_TYPE_ALL)
         return tuple([Int(r) for r in result]...)
     end
 
+    if s == :sub_group_sizes
+        res_size = Ref{Csize_t}()
+        clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, C_NULL, C_NULL, res_size)
+        result = Vector{Csize_t}(undef, res_size[] ÷ sizeof(Csize_t))
+        clGetDeviceInfo(d, CL_DEVICE_SUB_GROUP_SIZES_INTEL, sizeof(result), result, C_NULL)
+        return tuple([Int(r) for r in result]...)
+    end
+
     if s == :max_image2d_shape
         width = Ref{Csize_t}()
         height = Ref{Csize_t}()
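For callers, the net effect of the compiler plumbing above is a new `sub_group_size` keyword on `@opencl`: it is validated against `dev.sub_group_sizes`, carried in `OpenCLCompilerParams` (the `hash` method keeps compilations with different sizes from sharing a cache entry), and stamped onto the kernel entry as `intel_reqd_sub_group_size` function metadata. A hypothetical call site (kernel name and sizes are placeholders, the kwarg routing follows `COMPILER_KWARGS` above):

```julia
# sub_group_size is a compile-time kwarg alongside the usual launch kwargs;
# the generated kernel entry carries !intel_reqd_sub_group_size metadata.
@opencl global_size = 256 local_size = 64 sub_group_size = 16 my_kernel(a, b)
```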
diff --git a/src/pocl/pocl.jl b/src/pocl/pocl.jl
index 1cc693c86..4638b574a 100644
--- a/src/pocl/pocl.jl
+++ b/src/pocl/pocl.jl
@@ -41,7 +41,7 @@ function queue()
 end
 
 using GPUCompiler
-import LLVM
+import LLVM: LLVM, MDNode, ConstantInt, metadata
 using Adapt
 
 ## device overrides
diff --git a/test/intrinsics.jl b/test/intrinsics.jl
index 63216d32b..03e10970d 100644
--- a/test/intrinsics.jl
+++ b/test/intrinsics.jl
@@ -23,6 +23,46 @@ function test_intrinsics_kernel(results)
     end
     return
 end
+struct SubgroupData
+    sub_group_size::UInt32
+    max_sub_group_size::UInt32
+    num_sub_groups::UInt32
+    sub_group_id::UInt32
+    sub_group_local_id::UInt32
+end
+function test_subgroup_kernel(results)
+    i = KI.get_global_id().x
+
+    if i <= length(results)
+        @inbounds results[i] = SubgroupData(
+            KI.get_sub_group_size(),
+            KI.get_max_sub_group_size(),
+            KI.get_num_sub_groups(),
+            KI.get_sub_group_id(),
+            KI.get_sub_group_local_id()
+        )
+    end
+    return
+end
+
+function shfl_down_test_kernel(a, b, ::Val{N}) where {N}
+    idx = KI.get_sub_group_local_id()
+
+    val = a[idx]
+
+    offset = 0x00000001
+    while offset < N
+        val += KI.shfl_down(val, offset)
+        offset <<= 1
+    end
+
+    KI.sub_group_barrier()
+
+    if idx == 1
+        b[idx] = val
+    end
+    return
+end
 
 function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
@@ -122,6 +162,57 @@ function intrinsics_testsuite(backend, AT)
                 @test k_data.local_id == expected_local
             end
         end
+
+        @testset "Sub-groups" begin
+            @test KI.sub_group_size(backend()) isa Int
+
+            # Test with small kernel
+            sg_size = KI.sub_group_size(backend())
+            sg_n = 2
+            workgroupsize = sg_size * sg_n
+            numworkgroups = 2
+            N = workgroupsize * numworkgroups
+
+            results = AT(Vector{SubgroupData}(undef, N))
+            kernel = KI.@kernel backend() launch = false test_subgroup_kernel(results)
+
+            kernel(results; workgroupsize, numworkgroups)
+            KernelAbstractions.synchronize(backend())
+
+            host_results = Array(results)
+
+            # Verify results make sense
+            for (i, sg_data) in enumerate(host_results)
+                @test sg_data.sub_group_size == sg_size
+                @test sg_data.max_sub_group_size == sg_size
+                @test sg_data.num_sub_groups == sg_n
+
+                # Group ID should be 1-based
+                expected_sub_group = div(((i - 1) % workgroupsize), sg_size) + 1
+                @test sg_data.sub_group_id == expected_sub_group
+
+                # Local ID should be 1-based within group
+                expected_sg_local = ((i - 1) % sg_size) + 1
+                @test sg_data.sub_group_local_id == expected_sg_local
+            end
+        end
+        @testset "shfl_down" begin
+            @test !isempty(KI.shfl_down_types(backend()))
+            types_to_test = setdiff(KI.shfl_down_types(backend()), [Bool])
+            @testset "$T" for T in types_to_test
+                N = KI.sub_group_size(backend())
+                a = zeros(T, N)
+                rand!(a, (0:1))
+
+                dev_a = AT(a)
+                dev_b = AT(zeros(T, N))
+
+                KI.@kernel backend() workgroupsize = N shfl_down_test_kernel(dev_a, dev_b, Val(N))
+
+                b = Array(dev_b)
+                @test sum(a) ≈ b[1]
+            end
+        end
     end
     return nothing
 end
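As a sanity check on the test's reduction pattern: the ascending-offset loop in `shfl_down_test_kernel` leaves the full sub-group sum in lane 1, which is why only `b[1]` is compared against `sum(a)`. A host-side simulation of that loop (`simulate_shfl_down_sum` is a hypothetical helper; shuffles are replaced by plain indexing, with out-of-range lanes keeping their own value):

```julia
# Simulate the kernel's ascending-offset shfl_down reduction for one
# sub-group of size N (a power of two, as sub-group sizes are).
function simulate_shfl_down_sum(a::Vector)
    N = length(a)
    val = copy(a)
    offset = 1
    while offset < N
        # Lane k reads lane k + offset; past-the-end lanes keep their value.
        val = [k + offset <= N ? val[k] + val[k + offset] : val[k] for k in 1:N]
        offset <<= 1
    end
    return val[1]   # lane 1 holds the sub-group sum
end

a = rand(0:1, 32)
@assert simulate_shfl_down_sum(a) == sum(a)
```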