19 changes: 10 additions & 9 deletions .buildkite/pipeline.yml
@@ -13,7 +13,7 @@ steps:
command: |
julia -e 'println("--- :julia: Developing CUDA")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsics")'
Pkg.add(url="https://github.com/christiangnrd/CUDA.jl", rev="intrinsicsnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -76,7 +76,7 @@ steps:
command: |
julia -e 'println("--- :julia: Developing Metal")
using Pkg
Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintr")'
Pkg.add(url="https://github.com/JuliaGPU/Metal.jl", rev="kaintrnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -106,10 +106,11 @@ steps:
- JuliaCI/julia-coverage#v1:
codecov: true
command: |
julia -e 'println("--- :julia: Developing oneAPI")
julia --compiled-modules=no -e 'println("--- :julia: Developing oneAPI")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsics")
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
Pkg.add(url="https://github.com/christiangnrd/oneAPI.jl", rev="intrinsicsnew")
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
@@ -143,7 +144,7 @@ steps:
Pkg.add(url="https://github.com/christiangnrd/AcceleratedKernels.jl", rev="ka0.10simple")'
julia -e '
using Pkg
Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsics")
Pkg.add(url="https://github.com/christiangnrd/AMDGPU.jl", rev="intrinsicsnew")
println("--- :julia: Instantiating project")
Pkg.develop(; path=pwd())' || exit 3

@@ -174,11 +175,11 @@ steps:
command: |
julia -e 'println("--- :julia: Developing OpenCL")
using Pkg
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsics")
Pkg.develop(; name="SPIRVIntrinsics")'
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew")'
julia -e 'println("--- :julia: Instantiating project")
using Pkg
Pkg.develop(; path=pwd())' || exit 3
Pkg.develop(; path=pwd())
Pkg.develop(; name="SPIRVIntrinsics")' || exit 3

julia -e 'println("+++ :julia: Running tests")
using Pkg
9 changes: 9 additions & 0 deletions .github/workflows/ci.yml
@@ -128,6 +128,15 @@ jobs:
end
end
'
- name: "Develop SPIRVIntrinsics"
run: |
julia -e '
using Pkg
withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do
Pkg.activate(".")
Pkg.add(url="https://github.com/christiangnrd/OpenCL.jl", rev="intrinsicsnew", subdir="lib/intrinsics", name="SPIRVIntrinsics")
end
'
- uses: julia-actions/julia-buildpkg@v1
- uses: julia-actions/julia-runtest@v1
if: runner.os != 'Windows'
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "KernelAbstractions"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
authors = ["Valentin Churavy <v.churavy@gmail.com> and contributors"]
version = "0.10.0-dev"
version = "0.10.1-dev"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
139 changes: 139 additions & 0 deletions src/intrinsics.jl
@@ -102,6 +102,78 @@ Returns the unique group ID.
"""
function get_group_id end

"""
get_sub_group_size()::UInt32

Returns the number of work-items in the sub-group.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_size()::UInt32
```
"""
function get_sub_group_size end

"""
get_max_sub_group_size()::UInt32

Returns the maximum sub-group size for sub-groups in the current workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override get_max_sub_group_size()::UInt32
```
"""
function get_max_sub_group_size end

"""
get_num_sub_groups()::UInt32

Returns the number of sub-groups in the current workgroup.

!!! note
Backend implementations **must** implement:
```
@device_override get_num_sub_groups()::UInt32
```
"""
function get_num_sub_groups end

"""
get_sub_group_id()::UInt32

Returns the sub-group ID within the workgroup.

!!! note
1-based.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_id()::UInt32
```
"""
function get_sub_group_id end

"""
get_sub_group_local_id()::UInt32

Returns the work-item ID within the current sub-group.

!!! note
1-based.

!!! note
Backend implementations **must** implement:
```
@device_override get_sub_group_local_id()::UInt32
```
"""
function get_sub_group_local_id end
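Taken together, a minimal sketch of how these five queries compose inside a kernel (assuming a backend that implements them; `KI` is the `KernelAbstractions.KernelIntrinsics` alias used elsewhere in this PR, and the kernel itself is illustrative):

```julia
using KernelAbstractions
import KernelAbstractions.KernelIntrinsics as KI

# Each work-item computes a flat lane index from its sub-group
# coordinates. Both IDs are 1-based, as documented above.
@kernel function subgroup_layout!(out)
    i   = @index(Global, Linear)
    sg  = KI.get_sub_group_id()        # sub-group within the workgroup
    lid = KI.get_sub_group_local_id()  # lane within that sub-group
    sz  = KI.get_max_sub_group_size()  # upper bound on lanes per sub-group
    out[i] = (sg - 0x1) * sz + lid
end
```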


"""
localmemory(::Type{T}, dims)

@@ -116,6 +188,34 @@ Declare memory that is local to a workgroup.
"""
localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))

"""
shfl_down(val::T, offset::Integer) where T

Read `val` from the lane whose ID is `offset` higher than the
current lane's. Kernels using this function must not assume any
synchronization: `shfl_down` does not imply a sub-group barrier.

!!! note
Backend implementations **must** implement:
```
@device_override shfl_down(val::T, offset::Integer) where T
```
as well as the corresponding on-device functionality.
"""
function shfl_down end
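As a usage sketch (not part of this diff): a sub-group tree reduction is the canonical consumer of `shfl_down`. This assumes a power-of-two sub-group size and an element type reported by `shfl_down_types`:

```julia
# Device-side helper: after the loop, lane 1 holds the sub-group total;
# the other lanes hold partial sums.
function subgroup_sum(val)
    offset = KI.get_sub_group_size() ÷ 0x2
    while offset > 0
        # Every lane executes the shuffle; no synchronization is implied.
        val += KI.shfl_down(val, offset)
        offset ÷= 0x2
    end
    return val
end
```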

"""
shfl_down_types(::Backend)::Vector{DataType}

Returns a vector of the `DataType`s for which `shfl_down` is supported on `backend`.

!!! note
Backend implementations **must** implement this function
only if they support `shfl_down` for any types.
"""
shfl_down_types(::Backend) = DataType[]
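Host-side, this enables a capability check before choosing a kernel variant; a sketch (the backend constructor is illustrative):

```julia
backend = POCLBackend()  # illustrative; any KernelAbstractions backend
if Float64 in KI.shfl_down_types(backend)
    # launch the shfl_down-based reduction
else
    # fall back to a localmemory-based reduction
end
```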


"""
barrier()

@@ -139,6 +239,29 @@ function barrier()
error("Group barrier used outside kernel or not captured")
end

"""
sub_group_barrier()

After a `sub_group_barrier()` call, all reads and writes to global and local memory
from each thread in the sub-group are visible to all other threads in the
sub-group.

This does **not** guarantee that a write from a thread in a certain sub-group will
be visible to a thread in a different sub-group.

!!! note
`sub_group_barrier()` must be encountered either by all work-items of a sub-group executing the kernel or by none at all.

!!! note
Backend implementations **must** implement:
```
@device_override sub_group_barrier()
```
"""
function sub_group_barrier()
error("Sub-group barrier used outside kernel or not captured")
end
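A sketch of the intended pattern (assuming one sub-group per workgroup, so the `localmemory` region is shared only within the sub-group; the size and element type are illustrative):

```julia
@kernel function rotate_lanes!(out)
    lmem = KI.localmemory(Float32, 64)   # 64 slots, illustrative
    lid  = KI.get_sub_group_local_id()
    lmem[lid] = Float32(lid)
    KI.sub_group_barrier()               # the writes above are now visible
    nxt = lid == KI.get_sub_group_size() ? UInt32(1) : lid + UInt32(1)
    out[@index(Global, Linear)] = lmem[nxt]  # read the next lane's slot
end
```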

"""
_print(args...)

@@ -220,6 +343,22 @@ kernel launch with too big a workgroup is attempted.
"""
function max_work_group_size end

"""
sub_group_size(backend)::Int

Returns a reasonable sub-group size supported by the currently
active device for the specified backend. This is typically 32,
or 64 on devices that do not support 32.

!!! note
Backend implementations **must** implement:
```
sub_group_size(backend::NewBackend)::Int
```
as well as the corresponding on-device functionality.
"""
function sub_group_size end
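On the host, this query is meant for sizing launches; a sketch (backend constructor illustrative):

```julia
backend = POCLBackend()            # illustrative
sg = KI.sub_group_size(backend)    # typically 32; possibly 64 or 16
workgroup = 4 * sg                 # e.g. four sub-groups per workgroup
```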

"""
multiprocessor_count(backend::NewBackend)::Int

46 changes: 46 additions & 0 deletions src/pocl/backend.jl
@@ -7,6 +7,8 @@ using ..POCL: device, clconvert, clfunction
import KernelAbstractions as KA
import KernelAbstractions.KernelIntrinsics as KI

import SPIRVIntrinsics

import StaticArrays

import Adapt
@@ -174,10 +176,36 @@
function KI.max_work_group_size(::POCLBackend)::Int
return Int(device().max_work_group_size)
end
function KI.sub_group_size(::POCLBackend)::Int
sg_sizes = cl.device().sub_group_sizes
if 32 in sg_sizes
return 32
elseif 64 in sg_sizes
return 64
elseif 16 in sg_sizes
return 16
else
return 1
end
end
function KI.multiprocessor_count(::POCLBackend)::Int
return Int(device().max_compute_units)
end

function KI.shfl_down_types(::POCLBackend)
res = copy(SPIRVIntrinsics.gentypes)

backend_extensions = cl.device().extensions
if "cl_khr_fp64" ∉ backend_extensions
res = setdiff(res, [Float64])
end
if "cl_khr_fp16" ∉ backend_extensions
res = setdiff(res, [Float16])
end

return res
end

## Indexing Functions

@device_override @inline function KI.get_local_id()
@@ -204,6 +232,16 @@ end
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override KI.get_sub_group_size() = get_sub_group_size()

@device_override KI.get_max_sub_group_size() = get_max_sub_group_size()

@device_override KI.get_num_sub_groups() = get_num_sub_groups()

@device_override KI.get_sub_group_id() = get_sub_group_id()

@device_override KI.get_sub_group_local_id() = get_sub_group_local_id()

@device_override @inline function KA.__validindex(ctx)
if KA.__dynamic_checkbounds(ctx)
I = @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@@ -232,6 +270,14 @@ end
work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

@device_override @inline function KI.sub_group_barrier()
sub_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

@device_override function KI.shfl_down(val::T, offset::Integer) where {T}
sub_group_shuffle(val, get_sub_group_local_id() + offset)
end

@device_override @inline function KI._print(args...)
POCL._print(args...)
end
31 changes: 28 additions & 3 deletions src/pocl/compiler/compilation.jl
@@ -1,6 +1,14 @@
## gpucompiler interface

struct OpenCLCompilerParams <: AbstractCompilerParams end
Base.@kwdef struct OpenCLCompilerParams <: AbstractCompilerParams
sub_group_size::Int
end
function Base.hash(params::OpenCLCompilerParams, h::UInt)
h = hash(params.sub_group_size, h)

return h
end

const OpenCLCompilerConfig = CompilerConfig{SPIRVCompilerTarget, OpenCLCompilerParams}
const OpenCLCompilerJob = CompilerJob{SPIRVCompilerTarget, OpenCLCompilerParams}

@@ -19,7 +27,21 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
in(fn, known_intrinsics) ||
contains(fn, "__spirv_")

function GPUCompiler.finish_module!(
@nospecialize(job::OpenCLCompilerJob),
mod::LLVM.Module, entry::LLVM.Function
)
entry = invoke(
GPUCompiler.finish_module!,
Tuple{CompilerJob{SPIRVCompilerTarget}, LLVM.Module, LLVM.Function},
job, mod, entry
)

# Set the subgroup size
metadata(entry)["intel_reqd_sub_group_size"] = MDNode([ConstantInt(Int32(job.config.params.sub_group_size))])

return entry
end
## compiler implementation (cache, configure, compile, and link)

# cache of compilation caches, per context
@@ -45,14 +67,17 @@ function compiler_config(dev::cl.Device; kwargs...)
end
return config
end
@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, kwargs...)
@noinline function _compiler_config(dev; kernel = true, name = nothing, always_inline = false, sub_group_size = 32, kwargs...)
supports_fp16 = "cl_khr_fp16" in dev.extensions
supports_fp64 = "cl_khr_fp64" in dev.extensions

if sub_group_size ∉ dev.sub_group_sizes
throw(ArgumentError("$sub_group_size is not a valid sub-group size for this device."))
end

# create GPUCompiler objects
target = SPIRVCompilerTarget(; supports_fp16, supports_fp64, kwargs...)
params = OpenCLCompilerParams()
params = OpenCLCompilerParams(; sub_group_size)
return CompilerConfig(target, params; kernel, name, always_inline)
end

2 changes: 1 addition & 1 deletion src/pocl/compiler/execution.jl
@@ -4,7 +4,7 @@ export @opencl, clfunction, clconvert
## high-level @opencl interface

const MACRO_KWARGS = [:launch]
const COMPILER_KWARGS = [:kernel, :name, :always_inline]
const COMPILER_KWARGS = [:kernel, :name, :always_inline, :sub_group_size]
const LAUNCH_KWARGS = [:global_size, :local_size, :queue]

macro opencl(ex...)
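With `sub_group_size` accepted as a compiler kwarg, a launch might look like the following (kernel and arguments hypothetical; the requested size must be in `device.sub_group_sizes`, as validated in `compilation.jl` above):

```julia
# Request a 16-wide sub-group at compile time.
@opencl global_size=1024 local_size=64 sub_group_size=16 my_kernel(a, b)
```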