diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 5e1af6d..d5833ee 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -205,3 +205,160 @@ end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end + +# ============================================================================ +# Tile-wise atomic operations +# These take pre-computed pointer tiles, value tiles, and masks. +# Used by the public API for tile-indexed atomic operations. +# ============================================================================ + +# cuda_tile.atomic_cas_tko with tile pointers +@eval Intrinsics begin + """ + atomic_cas_tile(ptr_tile, expected, desired, mask, memory_order, memory_scope) + + Tile-wise atomic compare-and-swap. + Operates on a tile of pointers with a tile of expected/desired values. + Mask controls which elements are active (bounds checking). + Returns a tile of original values. + """ + @noinline function atomic_cas_tile(ptr_tile::Tile, expected::Tile{T, S}, + desired::Tile{T, S}, mask::Tile, + memory_order::Int, memory_scope::Int) where {T, S} + donotdelete(ptr_tile, expected, desired, mask) + compilerbarrier(:const, expected)::Tile{T, S} + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas_tile), args) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, expected, desired, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && error("atomic_cas_tile requires ptr_tile") + expected_tv = emit_value!(ctx, args[2]) + expected_tv === nothing && error("atomic_cas_tile requires expected value") + desired_tv = emit_value!(ctx, args[3]) + desired_tv === nothing && error("atomic_cas_tile requires desired value") + mask_tv = emit_value!(ctx, args[4]) + mask_tv === nothing && error("atomic_cas_tile requires mask") + + # Get memory order and scope + memory_order = @something get_constant(ctx, args[5]) error("atomic_cas_tile requires constant memory_order") + memory_scope = @something get_constant(ctx, args[6]) error("atomic_cas_tile requires constant memory_scope") + + # Get shape and element type from expected tile + shape = expected_tv.shape + elem_type = expected_tv.jltype.parameters[1] # T from Tile{T, S} + + # Create result type (tile with same shape as inputs) + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + # Emit atomic CAS with mask + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, expected_tv.v, desired_tv.v; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + # Return Tile type with the same shape + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple(shape)}, collect(shape)) +end + +# Shared helper for tile-wise atomic RMW operations +function emit_atomic_rmw_tile!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, val, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && error("atomic RMW tile requires ptr_tile") + val_tv = emit_value!(ctx, args[2]) + val_tv === nothing && error("atomic RMW tile requires value") + mask_tv = emit_value!(ctx, args[3]) + mask_tv === nothing && 
error("atomic RMW tile requires mask") + + # Get memory order and scope + memory_order = @something get_constant(ctx, args[4]) error("atomic RMW tile requires constant memory_order") + memory_scope = @something get_constant(ctx, args[5]) error("atomic RMW tile requires constant memory_scope") + + # Get shape and element type from value tile + shape = val_tv.shape + elem_type = val_tv.jltype.parameters[1] # T from Tile{T, S} + + # Create result type (tile with same shape as inputs) + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + # Use float add mode for floating point types + actual_mode = mode + if mode == AtomicADD && elem_type <: AbstractFloat + actual_mode = AtomicADDF + end + + # Emit atomic RMW with mask + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, val_tv.v, actual_mode; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + # Return Tile type with the same shape + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple(shape)}, collect(shape)) +end + +# cuda_tile.atomic_rmw_tko with XCHG (tile version) +@eval Intrinsics begin + """ + atomic_xchg_tile(ptr_tile, val, mask, memory_order, memory_scope) + + Tile-wise atomic exchange. + Operates on a tile of pointers with a tile of values. + Mask controls which elements are active (bounds checking). + Returns a tile of original values. + """ + @noinline function atomic_xchg_tile(ptr_tile::Tile, val::Tile{T, S}, mask::Tile, + memory_order::Int, memory_scope::Int) where {T, S} + donotdelete(ptr_tile, val, mask) + compilerbarrier(:const, val)::Tile{T, S} + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicXCHG) +end + +# cuda_tile.atomic_rmw_tko with ADD (tile version) +@eval Intrinsics begin + """ + atomic_add_tile(ptr_tile, val, mask, memory_order, memory_scope) + + Tile-wise atomic addition. + Operates on a tile of pointers with a tile of values. + Mask controls which elements are active (bounds checking). + Returns a tile of original values. 
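+
+    Typically reached through the public tile-wise `atomic_add` wrappers, which
+    pass integer `memory_order`/`memory_scope` values, e.g. (illustrative sketch,
+    with `ptr_tile` produced by `Intrinsics.offset` and `mask` marking the
+    in-bounds elements):
+
+    ```julia
+    Intrinsics.atomic_add_tile(ptr_tile, val_tile, mask,
+                               MemoryOrder.AcqRel, MemScope.Device)
+    ```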
+ """ + @noinline function atomic_add_tile(ptr_tile::Tile, val::Tile{T, S}, mask::Tile, + memory_order::Int, memory_scope::Int) where {T, S} + donotdelete(ptr_tile, val, mask) + compilerbarrier(:const, val)::Tile{T, S} + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicADD) +end diff --git a/src/language/arithmetic.jl b/src/language/arithmetic.jl index 1a69f39..63db996 100644 --- a/src/language/arithmetic.jl +++ b/src/language/arithmetic.jl @@ -193,3 +193,13 @@ for (op, pred) in ((:<, :CmpLessThan), (:>, :CmpGreaterThan), _cmp_intrinsic(broadcast_to(Tile(T(a)), S), b, $pred) end end + +# For index tile arithmetic: +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(-), a::Tile{T,S}, ::Base.RefValue{One}) where {T<:Integer,S} = + a .- one(T) +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(+), a::Tile{T,S}, ::Base.RefValue{One}) where {T<:Integer,S} = + a .+ one(T) +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(-), ::Base.RefValue{One}, a::Tile{T,S}) where {T<:Integer,S} = + one(T) .- a +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(+), ::Base.RefValue{One}, a::Tile{T,S}) where {T<:Integer,S} = + one(T) .+ a diff --git a/src/language/atomics.jl b/src/language/atomics.jl index 1c15f5f..372a6f6 100644 --- a/src/language/atomics.jl +++ b/src/language/atomics.jl @@ -80,3 +80,292 @@ old_val = ct.atomic_add(counters, idx, Int32(1)) memory_scope::Int=MemScope.Device) where {T, N} Intrinsics.atomic_add(array, index - One(), val, memory_order, memory_scope) end + +# ============================================================================ +# Tile-wise atomic operations +# These accept Tile indices to perform atomic operations on multiple elements. +# ============================================================================ + +# Operation registry: (name, intrinsic) pairs for RMW operations +# To add a new operation: 1) add entry here, 2) add intrinsic in compiler/intrinsics/atomics.jl +const ATOMIC_RMW_OPS = [ + (:add, :atomic_add_tile), + (:xchg, :atomic_xchg_tile), +] + +# ============================================================================ +# Pointer/Mask Helpers +# ============================================================================ + +""" +Compute pointer tile and bounds mask for 1D tile-wise atomic operations. +Returns (ptr_tile, mask, output_shape). +""" +@inline function _atomic_ptr_mask_1d(array::TileArray{T, 1}, indices::Tile{I, S}) where {T, I <: Integer, S} + indices_0 = indices .- One() + indices_i32 = astype(indices_0, Int32) + ptr_tile = Intrinsics.offset(array.ptr, indices_i32) + mask = (indices_i32 .>= Tile(Int32(0))) .& (indices_i32 .< Tile(array.sizes[1])) + (ptr_tile, mask, S) +end + +""" +Compute pointer tile and bounds mask for 2D tile-wise atomic operations. +Returns (ptr_tile, mask, output_shape). 
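+
+Used by the generated 2D entry points, e.g. (sketch mirroring the scalar-value
+`atomic_add`/`atomic_xchg` wrappers):
+
+```julia
+ptr_tile, mask, S = _atomic_ptr_mask_2d(array, indices)
+val_tile = broadcast_to(Tile(val), S)
+```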
+"""
+@inline function _atomic_ptr_mask_2d(array::TileArray{T, 2},
+                                     indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
+    idx0_0 = indices[1] .- One()
+    idx1_0 = indices[2] .- One()
+    S = broadcast_shape(S0, S1)
+    idx0_i32 = astype(broadcast_to(idx0_0, S), Int32)
+    idx1_i32 = astype(broadcast_to(idx1_0, S), Int32)
+    linear_idx = idx0_i32 .* Tile(array.strides[1]) .+ idx1_i32 .* Tile(array.strides[2])
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+    mask = (idx0_i32 .>= Tile(Int32(0))) .& (idx0_i32 .< Tile(array.sizes[1])) .&
+           (idx1_i32 .>= Tile(Int32(0))) .& (idx1_i32 .< Tile(array.sizes[2]))
+    (ptr_tile, mask, S)
+end
+
+"""
+Compute pointer tile and bounds mask for 2D tile-wise operations, additionally
+broadcasting the index shapes against the value-tile shape `Sval`.
+Returns (ptr_tile, mask, output_shape).
+"""
+@inline function _atomic_ptr_mask_2d_bc(array::TileArray{T, 2},
+                                        indices::Tuple{Tile{I0, S0}, Tile{I1, S1}},
+                                        Sval) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
+    idx0_0 = indices[1] .- One()
+    idx1_0 = indices[2] .- One()
+    S = broadcast_shape(broadcast_shape(S0, S1), Sval)
+    idx0_i32 = astype(broadcast_to(idx0_0, S), Int32)
+    idx1_i32 = astype(broadcast_to(idx1_0, S), Int32)
+    linear_idx = idx0_i32 .* Tile(array.strides[1]) .+ idx1_i32 .* Tile(array.strides[2])
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+    mask = (idx0_i32 .>= Tile(Int32(0))) .& (idx0_i32 .< Tile(array.sizes[1])) .&
+           (idx1_i32 .>= Tile(Int32(0))) .& (idx1_i32 .< Tile(array.sizes[2]))
+    (ptr_tile, mask, S)
+end
+
+"""
+Compute pointer tile and bounds mask for N-dimensional tile-level atomic operations.
+`index` is an N-tuple of tile-space indices (1-indexed).
+`Shape` is the tile shape.
+Returns (ptr_tile, mask).
+"""
+@inline function _tile_level_atomic_args(array::TileArray{T, N}, index::NTuple{N, Integer},
+                                         ::Val{Shape}) where {T, N, Shape}
+    # Create 1-indexed element index tiles for each dimension
+    # For dim d: arange [1..Shape[d]], reshaped for broadcasting, plus base offset
+    idx_tiles = ntuple(N) do d
+        bcast_shape = ntuple(i -> i == d ? Shape[d] : 1, N)
+        base = Int32((index[d] - 1) * Shape[d])
+        reshape(arange((Shape[d],), Int32), bcast_shape) .+ Tile(base)
+    end
+
+    # Compute 0-indexed linear offset: sum((idx[d] - 1) * stride[d])
+    linear_idx = reduce(.+, ntuple(N) do d
+        (idx_tiles[d] .- Tile(Int32(1))) .* Tile(array.strides[d])
+    end)
+
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+
+    # Bounds mask: 1 <= idx[d] <= sizes[d] for all d
+    mask = reduce(.&, ntuple(N) do d
+        (idx_tiles[d] .>= Tile(Int32(1))) .& (idx_tiles[d] .<= Tile(array.sizes[d]))
+    end)
+
+    (ptr_tile, mask)
+end
+
+# ============================================================================
+# Generated RMW Operations (add, xchg, ...)
+# ============================================================================
+
+for (op, intrinsic) in ATOMIC_RMW_OPS
+    fname = Symbol("atomic_", op)
+    doc_op = string(op)
+
+    # 1D tile-wise with scalar value
+    @eval begin
+        @doc """
+            $($fname)(array::TileArray{T, 1}, indices::Tile, val::T; memory_order, memory_scope) -> Tile{T, S}
+
+        Tile-wise atomic $($doc_op) on a 1D array.
+        Indices are 1-indexed. Out-of-bounds indices are masked.
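+
+        # Example
+
+        Illustrative sketch (assumes `arr` is a 1D `TileArray{Int32}` kernel argument):
+
+        ```julia
+        indices = arange((16,), Int32)            # 1-indexed element positions
+        old = $($fname)(arr, indices, Int32(1))   # returns the previous values
+        ```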
+ """ + @inline function $fname(array::TileArray{T, 1}, indices::Tile{I, S}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + end + + # 1D tile-wise with tile value + @eval begin + @doc """ + $($fname)(array::TileArray{T, 1}, indices::Tile, val::Tile{T, S}; ...) -> Tile{T, S} + + Tile-wise atomic $($doc_op) with a tile of values. + """ + @inline function $fname(array::TileArray{T, 1}, indices::Tile{I, S}, val::Tile{T, S}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + Intrinsics.$intrinsic(ptr_tile, val, mask, memory_order, memory_scope) + end + end + + # 2D tile-wise with scalar value + @eval begin + @doc """ + $($fname)(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, val::T; ...) -> Tile{T, S} + + Tile-wise atomic $($doc_op) on a 2D array. + """ + @inline function $fname(array::TileArray{T, 2}, + indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1} + ptr_tile, mask, S = _atomic_ptr_mask_2d(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + end + + # 2D tile-wise with tile value + @eval begin + @doc """ + $($fname)(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, val::Tile; ...) -> Tile{T, S} + + Tile-wise atomic $($doc_op) on a 2D array with a tile of values. + """ + @inline function $fname(array::TileArray{T, 2}, + indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, val::Tile{T, Stile}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile} + ptr_tile, mask, S = _atomic_ptr_mask_2d_bc(array, indices, Stile) + val_bc = broadcast_to(val, S) + Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) + end + end + + # Tile-level N-D (tuple of integer indices, tile value) + @eval begin + @doc """ + $($fname)(array::TileArray{T, N}, index, tile::Tile{T, Shape}; ...) -> Tile{T, Shape} + + Atomic $($doc_op) at tile-level index (like `store`). + Index can be an Integer (1D) or NTuple{N, Integer} (N-D). 
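+
+        # Example
+
+        Illustrative sketch (assumes `arr` is a 2D `TileArray{Float32}` kernel argument):
+
+        ```julia
+        tile = full((8, 8), 1.0f0, Float32)
+        old = $($fname)(arr, (1, 2), tile)   # tile-level index (1, 2), like `store`
+        ```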
+ """ + @inline function $fname(array::TileArray{T, N}, index::NTuple{N, Integer}, tile::Tile{T, Shape}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, N, Shape} + ptr_tile, mask = _tile_level_atomic_args(array, index, Val(Shape)) + Intrinsics.$intrinsic(ptr_tile, tile, mask, memory_order, memory_scope) + end + end + + # Tile-level 1D convenience (integer index -> 1-tuple) + @eval begin + @inline function $fname(array::TileArray{T, 1}, index::Integer, tile::Tile{T, Shape}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, Shape} + $fname(array, (index,), tile; memory_order, memory_scope) + end + end +end + +# ============================================================================ +# CAS Operations (separate - has expected + desired args) +# ============================================================================ + +""" + atomic_cas(array::TileArray{T, 1}, indices::Tile, expected, desired; memory_order, memory_scope) -> Tile{T, S} + +Tile-wise atomic compare-and-swap on a 1D array. +Indices are 1-indexed. Out-of-bounds indices are masked. +""" +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I, S}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +""" + atomic_cas(array::TileArray{T, 1}, indices::Tile, expected::Tile, desired::Tile; ...) -> Tile{T, S} + +Tile-wise atomic compare-and-swap with tiles of expected/desired values. +""" +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I, S}, + expected::Tile{T, S}, desired::Tile{T, S}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask, + memory_order, memory_scope) +end + +""" + atomic_cas(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, expected, desired; ...) -> Tile{T, S} + +Tile-wise atomic compare-and-swap on a 2D array. +""" +@inline function atomic_cas(array::TileArray{T, 2}, + indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1} + ptr_tile, mask, S = _atomic_ptr_mask_2d(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +""" + atomic_cas(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, expected::Tile, desired::Tile; ...) -> Tile{T, S} + +Tile-wise atomic compare-and-swap on a 2D array with tiles of values. 
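+
+# Example
+
+Illustrative sketch (index tiles shaped for broadcasting; `expected_tile` and
+`desired_tile` are assumed to be matching value tiles):
+
+```julia
+rows = reshape(arange((4,), Int32), (4, 1))
+cols = reshape(arange((4,), Int32), (1, 4))
+old = atomic_cas(arr, (rows, cols), expected_tile, desired_tile)
+```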
+"""
+@inline function atomic_cas(array::TileArray{T, 2},
+                            indices::Tuple{Tile{I0, S0}, Tile{I1, S1}},
+                            expected::Tile{T, Se}, desired::Tile{T, Sd};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Se, Sd}
+    # Broadcast against both value shapes so ptr_tile/mask match the broadcast values
+    ptr_tile, mask, S = _atomic_ptr_mask_2d_bc(array, indices, broadcast_shape(Se, Sd))
+    expected_bc = broadcast_to(expected, S)
+    desired_bc = broadcast_to(desired, S)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask,
+                               memory_order, memory_scope)
+end
+
+"""
+    atomic_cas(array::TileArray{T, N}, index, expected::Tile, desired::Tile; ...) -> Tile{T, Shape}
+
+Atomic compare-and-swap at tile-level index (like `store`).
+Index can be an Integer (1D) or NTuple{N, Integer} (N-D).
+"""
+@inline function atomic_cas(array::TileArray{T, N}, index::NTuple{N, Integer},
+                            expected::Tile{T, Shape}, desired::Tile{T, Shape};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, N, Shape}
+    ptr_tile, mask = _tile_level_atomic_args(array, index, Val(Shape))
+    Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask,
+                               memory_order, memory_scope)
+end
+
+# 1D convenience (integer index -> 1-tuple)
+@inline function atomic_cas(array::TileArray{T, 1}, index::Integer,
+                            expected::Tile{T, Shape}, desired::Tile{T, Shape};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, Shape}
+    atomic_cas(array, (index,), expected, desired; memory_order, memory_scope)
+end
diff --git a/src/language/operations.jl b/src/language/operations.jl
index 463a358..eedcb2b 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -199,7 +199,7 @@ tile = ct.gather(arr, indices; latency=3)
 @inline function gather(array::TileArray{T, 1}, indices::Tile{I, S};
                         latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S}
     # Convert to 0-indexed
-    indices_0 = indices .- one(I)
+    indices_0 = indices .- One()
 
     # Convert to Int32 for consistency with array.sizes
     indices_i32 = astype(indices_0, Int32)
@@ -232,8 +232,8 @@ Indices are 1-indexed. Index tiles are broadcast to a common shape.
 @inline function gather(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}};
                         latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
     # Convert to 0-indexed
-    idx0_0 = indices[1] .- one(I0)
-    idx1_0 = indices[2] .- one(I1)
+    idx0_0 = indices[1] .- One()
+    idx1_0 = indices[2] .- One()
 
     # Broadcast indices to common shape
     S = broadcast_shape(S0, S1)
@@ -291,7 +291,7 @@ ct.scatter(arr, indices, result_tile; latency=3)
 @inline function scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S};
                         latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S}
     # Convert to 0-indexed
-    indices_0 = indices .- one(I)
+    indices_0 = indices .- One()
 
     # Convert to Int32 for consistency with array.sizes
     indices_i32 = astype(indices_0, Int32)
@@ -321,8 +321,8 @@ Indices are 1-indexed. Index tiles and value tile must broadcast to same shape.
@inline function scatter(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, tile::Tile{T, Stile}; latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile} # Convert to 0-indexed - idx0_0 = indices[1] .- one(I0) - idx1_0 = indices[2] .- one(I1) + idx0_0 = indices[1] .- One() + idx1_0 = indices[2] .- One() # Broadcast indices to common shape (include value tile shape) S = broadcast_shape(broadcast_shape(S0, S1), Stile) diff --git a/test/codegen.jl b/test/codegen.jl index ae4b42e..b5515af 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -1094,6 +1094,66 @@ end end end + + @testset "tile-wise atomic_cas_tko" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + # Create index tile + @check "iota" + indices = ct.arange((16,), Int) + # Tile-wise atomic CAS + @check "offset" + @check "atomic_cas_tko" + ct.atomic_cas(arr, indices, Int32(0), Int32(1)) + return + end + end + end + + @testset "tile-wise atomic_rmw_tko" begin + spec = ct.ArraySpec{1}(16, true) + # tile-wise xchg + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_xchg(arr, indices, Int32(42)) + return + end + end + + # tile-wise add (integer) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, Int32(1)) + return + end + end + + # tile-wise add (float) + spec_f32 = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, 1.5f0) + return + end + end + end end #========================================================================= diff --git a/test/execution.jl b/test/execution.jl index a2072f3..6f60d5a 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1544,6 +1544,337 @@ end @test result == n_blocks end +@testset "atomic_add tile-wise 1D" begin + function atomic_add_tile_kernel(arr::ct.TileArray{Int,1}, TILE::ct.Constant{Int}) + bid = ct.bid(1) + base = (bid - 1) * TILE[] + indices = base .+ ct.arange((TILE[],), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Int, n) + + ct.launch(atomic_add_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-wise returns old values" begin + function atomic_add_return_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_add_return_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-wise Float32" begin + function atomic_add_f32_tile_kernel(arr::ct.TileArray{Float32,1}, TILE::ct.Constant{Int}) + bid = ct.bid(1) + base = (bid - 1) * TILE[] + indices = base .+ ct.arange((TILE[],), Int) + ct.atomic_add(arr, indices, 1.5f0; + 
memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Float32, n) + + ct.launch(atomic_add_f32_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(isapprox.(Array(arr), 1.5f0)) +end + +@testset "atomic_add tile-wise with tile values" begin + function atomic_add_tile_val_kernel(arr::ct.TileArray{Int,1}, + vals::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + val_tile = ct.gather(vals, indices) + old_vals = ct.atomic_add(arr, indices, val_tile; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + vals = CUDA.collect(Int, 1:16) + + ct.launch(atomic_add_tile_val_kernel, 1, arr, vals) + + @test Array(arr) == collect(1:16) +end + +@testset "atomic_xchg tile-wise" begin + function atomic_xchg_tile_kernel(arr::ct.TileArray{Int,1}) + bid = ct.bid(1) + indices = ct.arange((16,), Int) + ct.atomic_xchg(arr, indices, bid + 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_kernel, 1, arr) + + @test all(Array(arr) .== 2) +end + +@testset "atomic_cas tile-wise success" begin + function atomic_cas_tile_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_tile_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-wise failure" begin + function atomic_cas_fail_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 2; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.fill(Int(1), 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_fail_kernel, 1, arr, out) + + @test all(Array(out) .== 1) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-wise out-of-bounds" begin + function atomic_add_oob_kernel(arr::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 8) + + ct.launch(atomic_add_oob_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-level 1D" begin + function atomic_add_tile_level_kernel(arr::ct.TileArray{Int,1}) + tile = ct.full((16,), 1, Int) + ct.atomic_add(arr, 2, tile) + return + end + + arr = CUDA.zeros(Int, 32) + + ct.launch(atomic_add_tile_level_kernel, 1, arr) + + result = Array(arr) + @test all(result[1:16] .== 0) + @test all(result[17:32] .== 1) +end + +@testset "atomic_add tile-level 1D accumulates" begin + function atomic_add_tile_level_accum_kernel(arr::ct.TileArray{Int,1}) + tile = ct.full((16,), 1, Int) + ct.atomic_add(arr, 1, tile) + return + end + + arr = CUDA.zeros(Int, 16) + n_blocks = 10 + + ct.launch(atomic_add_tile_level_accum_kernel, n_blocks, arr) + + result = Array(arr) + @test all(result .== n_blocks) +end + +@testset "atomic_add tile-level 2D" begin + function atomic_add_tile_level_2d_kernel(arr::ct.TileArray{Float32,2}) + tile = ct.full((8, 8), 1.0f0, Float32) + ct.atomic_add(arr, (1, 2), tile) + return + end + + arr = CUDA.zeros(Float32, 16, 16) + + ct.launch(atomic_add_tile_level_2d_kernel, 1, arr) + + result = Array(arr) + @test all(result[1:8, 9:16] .== 1.0f0) + 
@test all(result[1:8, 1:8] .== 0.0f0) + @test all(result[9:16, :] .== 0.0f0) +end + +@testset "atomic_xchg tile-level 1D" begin + function atomic_xchg_tile_level_kernel(arr::ct.TileArray{Int,1}, + out::ct.TileArray{Int,1}) + tile = ct.full((16,), 42, Int) + old_vals = ct.atomic_xchg(arr, 1, tile) + ct.store(out, 1, old_vals) + return + end + + arr = CUDA.fill(Int(10), 16) + out = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_level_kernel, 1, arr, out) + + @test all(Array(arr) .== 42) + @test all(Array(out) .== 10) +end + +@testset "atomic_cas tile-level 1D success" begin + function atomic_cas_tile_level_kernel(arr::ct.TileArray{Int,1}) + expected = ct.full((16,), 0, Int) + desired = ct.full((16,), 1, Int) + ct.atomic_cas(arr, 1, expected, desired) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_cas_tile_level_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-level 3D" begin + function atomic_add_tile_level_3d_kernel(arr::ct.TileArray{Float32,3}) + tile = ct.full((4, 4, 4), 1.0f0, Float32) + ct.atomic_add(arr, (1, 2, 1), tile) + return + end + + arr = CUDA.zeros(Float32, 8, 8, 8) + + ct.launch(atomic_add_tile_level_3d_kernel, 1, arr) + + result = Array(arr) + # Tile at (1,2,1) means elements [1:4, 5:8, 1:4] + @test all(result[1:4, 5:8, 1:4] .== 1.0f0) + @test all(result[5:8, :, :] .== 0.0f0) + @test all(result[:, 1:4, :] .== 0.0f0) + @test all(result[:, :, 5:8] .== 0.0f0) +end + +@testset "atomic_add tile-level 3D accumulates" begin + function atomic_add_3d_accum_kernel(arr::ct.TileArray{Int,3}) + tile = ct.full((4, 4, 4), 1, Int) + ct.atomic_add(arr, (1, 1, 1), tile) + return + end + + arr = CUDA.zeros(Int, 4, 4, 4) + n_blocks = 5 + + ct.launch(atomic_add_3d_accum_kernel, n_blocks, arr) + + result = Array(arr) + @test all(result .== 5) +end + +@testset "atomic_add tile-level 4D" begin + function atomic_add_tile_level_4d_kernel(arr::ct.TileArray{Float32,4}) + tile = ct.full((2, 2, 2, 2), 1.0f0, Float32) + ct.atomic_add(arr, (2, 1, 2, 1), tile) + return + end + + arr = CUDA.zeros(Float32, 4, 4, 4, 4) + + ct.launch(atomic_add_tile_level_4d_kernel, 1, arr) + + result = Array(arr) + # Tile at (2,1,2,1) means elements [3:4, 1:2, 3:4, 1:2] + @test all(result[3:4, 1:2, 3:4, 1:2] .== 1.0f0) + # Check some zero regions + @test all(result[1:2, :, :, :] .== 0.0f0) + @test all(result[:, 3:4, :, :] .== 0.0f0) +end + +@testset "atomic_xchg tile-level 3D" begin + function atomic_xchg_tile_level_3d_kernel(arr::ct.TileArray{Int,3}, + out::ct.TileArray{Int,3}) + tile = ct.full((4, 4, 4), 42, Int) + old_vals = ct.atomic_xchg(arr, (1, 1, 1), tile) + ct.store(out, (1, 1, 1), old_vals) + return + end + + arr = CUDA.fill(Int(7), 4, 4, 4) + out = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_xchg_tile_level_3d_kernel, 1, arr, out) + + @test all(Array(arr) .== 42) + @test all(Array(out) .== 7) +end + +@testset "atomic_cas tile-level 3D success" begin + function atomic_cas_tile_level_3d_kernel(arr::ct.TileArray{Int,3}) + expected = ct.full((4, 4, 4), 0, Int) + desired = ct.full((4, 4, 4), 1, Int) + ct.atomic_cas(arr, (1, 1, 1), expected, desired) + return + end + + arr = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_cas_tile_level_3d_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-level 3D failure" begin + function atomic_cas_tile_level_3d_fail_kernel(arr::ct.TileArray{Int,3}, + out::ct.TileArray{Int,3}) + expected = ct.full((4, 4, 4), 0, Int) # wrong expected value + desired = ct.full((4, 4, 4), 99, Int) + old_vals = 
ct.atomic_cas(arr, (1, 1, 1), expected, desired) + ct.store(out, (1, 1, 1), old_vals) + return + end + + arr = CUDA.fill(Int(5), 4, 4, 4) # actual value is 5, not 0 + out = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_cas_tile_level_3d_fail_kernel, 1, arr, out) + + @test all(Array(arr) .== 5) # unchanged (CAS failed) + @test all(Array(out) .== 5) # old values returned +end + @testset "1D gather - simple" begin # Simple 1D gather: copy first 16 elements using gather function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) @@ -1588,6 +1919,84 @@ end @test Array(dst) ≈ Array(src) end +@testset "2D gather - simple" begin + function gather_2d_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + row_indices = ct.reshape(ct.arange((4,), Int), (4, 1)) + col_indices = ct.reshape(ct.arange((4,), Int), (1, 4)) + tile = ct.gather(src, (row_indices, col_indices)) + ct.store(dst, (1, 1), tile) + return + end + + src = CUDA.rand(Float32, 4, 4) + dst = CUDA.zeros(Float32, 4, 4) + + ct.launch(gather_2d_kernel, 1, src, dst) + + @test Array(dst) ≈ Array(src) +end + +@testset "2D scatter - simple" begin + function scatter_2d_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + tile = ct.load(src, (1, 1), (4, 4)) + row_indices = ct.reshape(ct.arange((4,), Int), (4, 1)) + col_indices = ct.reshape(ct.arange((4,), Int), (1, 4)) + ct.scatter(dst, (row_indices, col_indices), tile) + return + end + + src = CUDA.rand(Float32, 4, 4) + dst = CUDA.zeros(Float32, 4, 4) + + ct.launch(scatter_2d_kernel, 1, src, dst) + + @test Array(dst) ≈ Array(src) +end + +@testset "2D gather - strided access" begin + function gather_2d_strided_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + row_base = ct.arange((4,), Int) + col_base = ct.arange((4,), Int) + row_indices = ct.reshape(row_base .* 2 .- 1, (4, 1)) # 1,3,5,7 + col_indices = ct.reshape(col_base .* 2 .- 1, (1, 4)) # 1,3,5,7 + tile = ct.gather(src, (row_indices, col_indices)) + ct.store(dst, (1, 1), tile) + return + end + + src = CUDA.rand(Float32, 8, 8) + dst = CUDA.zeros(Float32, 4, 4) + + ct.launch(gather_2d_strided_kernel, 1, src, dst) + + src_cpu = Array(src) + expected = src_cpu[1:2:7, 1:2:7] + @test Array(dst) ≈ expected +end + +@testset "2D scatter - strided access" begin + function scatter_2d_strided_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + tile = ct.load(src, (1, 1), (4, 4)) + row_base = ct.arange((4,), Int) + col_base = ct.arange((4,), Int) + row_indices = ct.reshape(row_base .* 2 .- 1, (4, 1)) + col_indices = ct.reshape(col_base .* 2 .- 1, (1, 4)) + ct.scatter(dst, (row_indices, col_indices), tile) + return + end + + src = CUDA.rand(Float32, 4, 4) + dst = CUDA.zeros(Float32, 8, 8) + + ct.launch(scatter_2d_strided_kernel, 1, src, dst) + + src_cpu = Array(src) + dst_cpu = Array(dst) + @test dst_cpu[1:2:7, 1:2:7] ≈ src_cpu + @test all(dst_cpu[2:2:8, :] .== 0) + @test all(dst_cpu[:, 2:2:8] .== 0) +end + end @testset "Entry Hints" begin