19 changes: 10 additions & 9 deletions .buildkite/pipeline.yml
@@ -13,7 +13,7 @@ steps:
command: |
julia -e 'println("--- :julia: Developing CUDA")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")'
Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -76,7 +76,7 @@ steps:
command: |
julia -e 'println("--- :julia: Developing Metal")
using Pkg
Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")'
Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -106,10 +106,11 @@ steps:
- JuliaCI/julia-coverage#v1:
codecov: true
command: |
julia -e 'println("--- :julia: Developing oneAPI")
julia --compiled-modules=no -e 'println("--- :julia: Developing oneAPI")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -143,7 +144,7 @@ steps:
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
julia -e '
using Pkg
Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew")
println("--- :julia: Instantiating project")
Pkg.develop(; path=pwd())' || exit 3

@@ -174,11 +175,11 @@ steps:
command: |
julia -e 'println("--- :julia: Developing OpenCL")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics")
Pkg.develop(; name="SPIRVIntrinsics")'
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
Pkg.develop(; path=pwd())
Pkg.develop(; name="SPIRVIntrinsics")' || exit 3

julia -e 'println("+++ :julia: Running tests")
using Pkg
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
@@ -128,6 +128,15 @@ jobs:
end
end
'
- name: "Develop SPIRVIntrinsics"
run: |
julia -e '
using Pkg
withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
Pkg.activate(".")
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")
end
'
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
if: runner.os != 'Windows'
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "KernelAbstractions"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
authors = ["Valentin Churavy <v.churavy@gmail.com> and contributors"]
version = "0.10.0-dev"
version = "0.10.1-dev"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
139 changes: 139 additions & 0 deletions src/intrinsics.jl
@@ -102,6 +102,78 @@ Returns the unique group ID.
"""
function get_group_id end

"""
get_sub_group_size()::UInt32

Returns the number of work-items in the sub-group.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_size()::UInt32
```
"""
function get_sub_group_size end

"""
get_max_sub_group_size()::UInt32

Returns the maximum sub-group size for sub-groups in the current workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override get_max_sub_group_size()::UInt32
```
"""
function get_max_sub_group_size end

"""
get_num_sub_groups()::UInt32

Returns the number of sub-groups in the current workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override get_num_sub_groups()::UInt32
```
"""
function get_num_sub_groups end

"""
get_sub_group_id()::UInt32

Returns the sub-group ID within the workgroup.

!!! note
1-based.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_id()::UInt32
```
"""
function get_sub_group_id end

"""
get_sub_group_local_id()::UInt32

Returns the work-item ID within the current sub-group.

!!! note
1-based.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_local_id()::UInt32
```
"""
function get_sub_group_local_id end
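Taken together, a minimal sketch of how these five queries compose inside a kernel (assuming a backend that implements them; `KI` is the `KernelAbstractions.KernelIntrinsics` alias used elsewhere in this PR, and the kernel itself is illustrative):

```julia
using KernelAbstractions
import KernelAbstractions.KernelIntrinsics as KI

# Each work-item computes a flat lane index from its sub-group
# coordinates. Both IDs are 1-based, as documented above.
@kernel function subgroup_layout!(out)
    i   = @index(Global, Linear)
    sg  = KI.get_sub_group_id()        # sub-group within the workgroup
    lid = KI.get_sub_group_local_id()  # lane within that sub-group
    sz  = KI.get_max_sub_group_size()  # upper bound on lanes per sub-group
    out[i] = (sg - 0x1) * sz + lid
end
```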


"""
localmemory(::Type{T}, dims)

@@ -116,6 +188,34 @@ Declare memory that is local to a workgroup.
"""
localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))

"""
shfl_down(val::T, offset::Integer) where T

Read `val` from the lane whose ID is `offset` higher than the
current lane's. Kernels using this function must not assume any
synchronization: `shfl_down` does not imply a sub-group barrier.

!!! note
Backend implementations **must** implement:
```
@device_override shfl_down(val::T, offset::Integer) where T
```
as well as the corresponding on-device functionality.
"""
function shfl_down end
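As a usage sketch (not part of this diff): a sub-group tree reduction is the canonical consumer of `shfl_down`. This assumes a power-of-two sub-group size and an element type reported by `shfl_down_types`:

```julia
# Device-side helper: after the loop, lane 1 holds the sub-group total;
# the other lanes hold partial sums.
function subgroup_sum(val)
    offset = KI.get_sub_group_size() ÷ 0x2
    while offset > 0
        # Every lane executes the shuffle; no synchronization is implied.
        val += KI.shfl_down(val, offset)
        offset ÷= 0x2
    end
    return val
end
```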

"""
shfl_down_types(::Backend)::Vector{DataType}

Returns a vector of the `DataType`s for which `shfl_down` is supported on `backend`.

!!! note
Backend implementations **must** implement this function
only if they support `shfl_down` for any types.
"""
shfl_down_types(::Backend) = DataType[]
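Host-side, this enables a capability check before choosing a kernel variant; a sketch (the backend constructor is illustrative):

```julia
backend = POCLBackend()  # illustrative; any KernelAbstractions backend
if Float64 in KI.shfl_down_types(backend)
    # launch the shfl_down-based reduction
else
    # fall back to a localmemory-based reduction
end
```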


"""
barrier()

@@ -139,6 +239,29 @@ function barrier()
error("Group barrier used outside kernel or not captured")
end

"""
sub_group_barrier()

After a `sub_group_barrier()` call, all reads and writes to global and local memory
from each thread in the sub-group are visible to all other threads in the
sub-group.

This does **not** guarantee that a write from a thread in a certain sub-group will
be visible to a thread in a different sub-group.

!!! note
`sub_group_barrier()` must be encountered either by all work-items of a sub-group executing the kernel or by none at all.

!!! note
Backend implementations **must** implement:
```
@device_override sub_group_barrier()
```
"""
function sub_group_barrier()
error("Sub-group barrier used outside kernel or not captured")
end
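A sketch of the intended pattern (assuming one sub-group per workgroup, so the `localmemory` region is shared only within the sub-group; the size and element type are illustrative):

```julia
@kernel function rotate_lanes!(out)
    lmem = KI.localmemory(Float32, 64)   # 64 slots, illustrative
    lid  = KI.get_sub_group_local_id()
    lmem[lid] = Float32(lid)
    KI.sub_group_barrier()               # the writes above are now visible
    nxt = lid == KI.get_sub_group_size() ? UInt32(1) : lid + UInt32(1)
    out[@index(Global, Linear)] = lmem[nxt]  # read the next lane's slot
end
```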

"""
_print(args...)

@@ -220,6 +343,22 @@ kernel launch with too big a workgroup is attempted.
"""
function max_work_group_size end

"""
sub_group_size(backend)::Int

Returns a reasonable sub-group size supported by the currently
active device for the specified backend. This is typically 32,
or 64 on devices that do not support 32.

!!! note
Backend implementations **must** implement:
```
sub_group_size(backend::NewBackend)::Int
```
as well as the corresponding on-device functionality.
"""
function sub_group_size end
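On the host, this query is meant for sizing launches; a sketch (backend constructor illustrative):

```julia
backend = POCLBackend()            # illustrative
sg = KI.sub_group_size(backend)    # typically 32; possibly 64 or 16
workgroup = 4 * sg                 # e.g. four sub-groups per workgroup
```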

"""
multiprocessor_count(backend::NewBackend)::Int

46 changes: 46 additions & 0 deletions src/pocl/backend.jl
@@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction
import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI

import SPIRVIntrinsics

import StaticArrays

import Adapt
@@ -174,10 +176,36 @@
function KI.max_work_group_size(::POCLBackend)::Int
return Int(device().max_work_group_size)
end
function KI.sub_group_size(::POCLBackend)::Int
sg_sizes = cl.device().sub_group_sizes
if 32 in sg_sizes
return 32
elseif 64 in sg_sizes
return 64
elseif 16 in sg_sizes
return 16
else
return 1
end
end
function KI.multiprocessor_count(::POCLBackend)::Int
return Int(device().max_compute_units)
end

function KI.shfl_down_types(::POCLBackend)
res = copy(SPIRVIntrinsics.gentypes)

backend_extensions = cl.device().extensions
if "cl_khr_fp64" ∉ backend_extensions
res = setdiff(res, [Float64])
end
if "cl_khr_fp16" ∉ backend_extensions
res = setdiff(res, [Float16])
end

return res
end

## Indexing Functions

@device_override @inline function KI.get_local_id()
@@ -204,6 +232,16 @@ end
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override KI.get_sub_group_size() = get_sub_group_size()

@device_override KI.get_max_sub_group_size() = get_max_sub_group_size()

@device_override KI.get_num_sub_groups() = get_num_sub_groups()

@device_override KI.get_sub_group_id() = get_sub_group_id()

@device_override KI.get_sub_group_local_id() = get_sub_group_local_id()

@device_override @inline function KA.__validindex(ctx)
if KA.__dynamic_checkbounds(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@@ -232,6 +270,14 @@ end
work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

@device_override @inline function KI.sub_group_barrier()
sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
sub_group_shuffle(val, get_sub_group_local_id() + offset)
end

@device_override @inline function KI._print(args...)
POCL._print(args...)
end
31 changes: 28 additions & 3 deletions src/pocl/compiler/compilation.jl
@@ -1,6 +1,14 @@
## gpucompiler interface

struct OpenCLCompilerParams <: AbstractCompilerParams end
Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
sub_group_size::Int
end
function Base.hash(params::OpenCLCompilerParams, h::UInt)
h = hash(params.sub_group_size, h)

return h
end

const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams}

@@ -19,7 +27,21 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
in(fn, known_intrinsics) ||
contains(fn, "__spirv_")

function GPUCompiler.finish_module!(
@nospecialize(job::OpenCLCompilerJob),
mod::LLVM.Module, entry::LLVM.Function
)
entry = invoke(
GPUCompiler.finish_module!,
Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
job, mod, entry
)

# Set the subgroup size
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])

return entry
end
## compiler implementation (cache, configure, compile, and link)

# cache of compilation caches, per context
@@ -45,14 +67,17 @@ function compiler_config(dev::cl.Device; kwargs...)
end
return config
end
@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...)
@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, sub_group_size = 32, kwargs...)
supports_fp16 = "cl_khr_fp16" in dev.extensions
supports_fp64 = "cl_khr_fp64" in dev.extensions

if sub_group_size ∉ dev.sub_group_sizes
throw(ArgumentError("$sub_group_size is not a valid sub-group size for this device."))
end

# create GPUCompiler objects
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
params = OpenCLCompilerParams()
params = OpenCLCompilerParams(; sub_group_size)
return CompilerConfig(target, params; kernel, name, always_inline)
end

2 changes: 1 addition & 1 deletion src/pocl/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction, clconvert
## high-level @opencl interface

const MACRO_KWARGS = [:launch]
const COMPILER_KWARGS = [:kernel, :name, :always_inline]
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :sub_group_size]
const LAUNCH_KWARGS = [:global_size, :local_size, :queue]

macro opencl(ex...)
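With `sub_group_size` accepted as a compiler kwarg, a launch might look like the following (kernel and arguments hypothetical; the requested size must be in `device.sub_group_sizes`, as validated in `compilation.jl` above):

```julia
# Request a 16-wide sub-group at compile time.
@opencl global_size=1024 local_size=64 sub_group_size=16 my_kernel(a, b)
```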