diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index 5e1af6d..d5833ee 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -205,3 +205,160 @@ end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add), args) emit_atomic_rmw!(ctx, args, AtomicADD) end + +# ============================================================================ +# Tile-wise atomic operations +# These take pre-computed pointer tiles, value tiles, and masks. +# Used by the public API for tile-indexed atomic operations. +# ============================================================================ + +# cuda_tile.atomic_cas_tko with tile pointers +@eval Intrinsics begin + """ + atomic_cas_tile(ptr_tile, expected, desired, mask, memory_order, memory_scope) + + Tile-wise atomic compare-and-swap. + Operates on a tile of pointers with a tile of expected/desired values. + Mask controls which elements are active (bounds checking). + Returns a tile of original values. + """ + @noinline function atomic_cas_tile(ptr_tile::Tile, expected::Tile{T, S}, + desired::Tile{T, S}, mask::Tile, + memory_order::Int, memory_scope::Int) where {T, S} + donotdelete(ptr_tile, expected, desired, mask) + compilerbarrier(:const, expected)::Tile{T, S} + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_cas_tile), args) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, expected, desired, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && error("atomic_cas_tile requires ptr_tile") + expected_tv = emit_value!(ctx, args[2]) + expected_tv === nothing && error("atomic_cas_tile requires expected value") + desired_tv = emit_value!(ctx, args[3]) + desired_tv === nothing && error("atomic_cas_tile requires desired value") + mask_tv = emit_value!(ctx, args[4]) + mask_tv === nothing && error("atomic_cas_tile requires mask") + + # Get memory order and scope + memory_order = @something get_constant(ctx, args[5]) error("atomic_cas_tile requires constant memory_order") + memory_scope = @something get_constant(ctx, args[6]) error("atomic_cas_tile requires constant memory_scope") + + # Get shape and element type from expected tile + shape = expected_tv.shape + elem_type = expected_tv.jltype.parameters[1] # T from Tile{T, S} + + # Create result type (tile with same shape as inputs) + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + # Emit atomic CAS with mask + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicCASPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, expected_tv.v, desired_tv.v; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + # Return Tile type with the same shape + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple(shape)}, collect(shape)) +end + +# Shared helper for tile-wise atomic RMW operations +function emit_atomic_rmw_tile!(ctx::CGCtx, args::AbstractVector, mode::AtomicRMWMode) + cb = ctx.cb + tt = ctx.tt + + # args: (ptr_tile, val, mask, memory_order, memory_scope) + ptr_tv = emit_value!(ctx, args[1]) + ptr_tv === nothing && error("atomic RMW tile requires ptr_tile") + val_tv = emit_value!(ctx, args[2]) + val_tv === nothing && error("atomic RMW tile requires value") + mask_tv = emit_value!(ctx, args[3]) + mask_tv === nothing && 
error("atomic RMW tile requires mask") + + # Get memory order and scope + memory_order = @something get_constant(ctx, args[4]) error("atomic RMW tile requires constant memory_order") + memory_scope = @something get_constant(ctx, args[5]) error("atomic RMW tile requires constant memory_scope") + + # Get shape and element type from value tile + shape = val_tv.shape + elem_type = val_tv.jltype.parameters[1] # T from Tile{T, S} + + # Create result type (tile with same shape as inputs) + dtype = julia_to_tile_dtype!(tt, elem_type) + result_tile_type = tile_type!(tt, dtype, collect(shape)) + token_type = Token(tt) + + # Use float add mode for floating point types + actual_mode = mode + if mode == AtomicADD && elem_type <: AbstractFloat + actual_mode = AtomicADDF + end + + # Emit atomic RMW with mask + mem_ordering = memory_order_to_semantics(memory_order) + mem_scope = memory_scope_to_scope(memory_scope) + + old_val, new_token = encode_AtomicRMWPtrOp!(cb, result_tile_type, token_type, + ptr_tv.v, val_tv.v, actual_mode; + mask=mask_tv.v, + token=ctx.token, + memory_ordering=mem_ordering, + memory_scope=mem_scope) + ctx.token = new_token + + # Return Tile type with the same shape + CGVal(old_val, result_tile_type, Tile{elem_type, Tuple(shape)}, collect(shape)) +end + +# cuda_tile.atomic_rmw_tko with XCHG (tile version) +@eval Intrinsics begin + """ + atomic_xchg_tile(ptr_tile, val, mask, memory_order, memory_scope) + + Tile-wise atomic exchange. + Operates on a tile of pointers with a tile of values. + Mask controls which elements are active (bounds checking). + Returns a tile of original values. + """ + @noinline function atomic_xchg_tile(ptr_tile::Tile, val::Tile{T, S}, mask::Tile, + memory_order::Int, memory_scope::Int) where {T, S} + donotdelete(ptr_tile, val, mask) + compilerbarrier(:const, val)::Tile{T, S} + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_xchg_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicXCHG) +end + +# cuda_tile.atomic_rmw_tko with ADD (tile version) +@eval Intrinsics begin + """ + atomic_add_tile(ptr_tile, val, mask, memory_order, memory_scope) + + Tile-wise atomic addition. + Operates on a tile of pointers with a tile of values. + Mask controls which elements are active (bounds checking). + Returns a tile of original values. 
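+
+    Typically reached through the public tile-wise `atomic_add` wrappers, which
+    pass integer `memory_order`/`memory_scope` values, e.g. (illustrative sketch,
+    with `ptr_tile` produced by `Intrinsics.offset` and `mask` marking the
+    in-bounds elements):
+
+    ```julia
+    Intrinsics.atomic_add_tile(ptr_tile, val_tile, mask,
+                               MemoryOrder.AcqRel, MemScope.Device)
+    ```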
+ """ + @noinline function atomic_add_tile(ptr_tile::Tile, val::Tile{T, S}, mask::Tile, + memory_order::Int, memory_scope::Int) where {T, S} + donotdelete(ptr_tile, val, mask) + compilerbarrier(:const, val)::Tile{T, S} + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.atomic_add_tile), args) + emit_atomic_rmw_tile!(ctx, args, AtomicADD) +end diff --git a/src/language/arithmetic.jl b/src/language/arithmetic.jl index 1a69f39..63db996 100644 --- a/src/language/arithmetic.jl +++ b/src/language/arithmetic.jl @@ -193,3 +193,13 @@ for (op, pred) in ((:<, :CmpLessThan), (:>, :CmpGreaterThan), _cmp_intrinsic(broadcast_to(Tile(T(a)), S), b, $pred) end end + +# For index tile arithmetic: +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(-), a::Tile{T,S}, ::Base.RefValue{One}) where {T<:Integer,S} = + a .- one(T) +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(+), a::Tile{T,S}, ::Base.RefValue{One}) where {T<:Integer,S} = + a .+ one(T) +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(-), ::Base.RefValue{One}, a::Tile{T,S}) where {T<:Integer,S} = + one(T) .- a +@inline Base.Broadcast.broadcasted(::TileStyle, ::typeof(+), ::Base.RefValue{One}, a::Tile{T,S}) where {T<:Integer,S} = + one(T) .+ a diff --git a/src/language/atomics.jl b/src/language/atomics.jl index 1c15f5f..372a6f6 100644 --- a/src/language/atomics.jl +++ b/src/language/atomics.jl @@ -80,3 +80,292 @@ old_val = ct.atomic_add(counters, idx, Int32(1)) memory_scope::Int=MemScope.Device) where {T, N} Intrinsics.atomic_add(array, index - One(), val, memory_order, memory_scope) end + +# ============================================================================ +# Tile-wise atomic operations +# These accept Tile indices to perform atomic operations on multiple elements. +# ============================================================================ + +# Operation registry: (name, intrinsic) pairs for RMW operations +# To add a new operation: 1) add entry here, 2) add intrinsic in compiler/intrinsics/atomics.jl +const ATOMIC_RMW_OPS = [ + (:add, :atomic_add_tile), + (:xchg, :atomic_xchg_tile), +] + +# ============================================================================ +# Pointer/Mask Helpers +# ============================================================================ + +""" +Compute pointer tile and bounds mask for 1D tile-wise atomic operations. +Returns (ptr_tile, mask, output_shape). +""" +@inline function _atomic_ptr_mask_1d(array::TileArray{T, 1}, indices::Tile{I, S}) where {T, I <: Integer, S} + indices_0 = indices .- One() + indices_i32 = astype(indices_0, Int32) + ptr_tile = Intrinsics.offset(array.ptr, indices_i32) + mask = (indices_i32 .>= Tile(Int32(0))) .& (indices_i32 .< Tile(array.sizes[1])) + (ptr_tile, mask, S) +end + +""" +Compute pointer tile and bounds mask for 2D tile-wise atomic operations. +Returns (ptr_tile, mask, output_shape). 
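+
+Used by the generated 2D entry points, e.g. (sketch mirroring the scalar-value
+`atomic_add`/`atomic_xchg` wrappers):
+
+```julia
+ptr_tile, mask, S = _atomic_ptr_mask_2d(array, indices)
+val_tile = broadcast_to(Tile(val), S)
+```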
+"""
+@inline function _atomic_ptr_mask_2d(array::TileArray{T, 2},
+                                     indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
+    idx0_0 = indices[1] .- One()
+    idx1_0 = indices[2] .- One()
+    S = broadcast_shape(S0, S1)
+    idx0_i32 = astype(broadcast_to(idx0_0, S), Int32)
+    idx1_i32 = astype(broadcast_to(idx1_0, S), Int32)
+    linear_idx = idx0_i32 .* Tile(array.strides[1]) .+ idx1_i32 .* Tile(array.strides[2])
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+    mask = (idx0_i32 .>= Tile(Int32(0))) .& (idx0_i32 .< Tile(array.sizes[1])) .&
+           (idx1_i32 .>= Tile(Int32(0))) .& (idx1_i32 .< Tile(array.sizes[2]))
+    (ptr_tile, mask, S)
+end
+
+"""
+Compute pointer tile and bounds mask for 2D tile-wise operations, additionally
+broadcasting the index shapes against the value-tile shape `Sval`.
+Returns (ptr_tile, mask, output_shape).
+"""
+@inline function _atomic_ptr_mask_2d_bc(array::TileArray{T, 2},
+                                        indices::Tuple{Tile{I0, S0}, Tile{I1, S1}},
+                                        Sval) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
+    idx0_0 = indices[1] .- One()
+    idx1_0 = indices[2] .- One()
+    S = broadcast_shape(broadcast_shape(S0, S1), Sval)
+    idx0_i32 = astype(broadcast_to(idx0_0, S), Int32)
+    idx1_i32 = astype(broadcast_to(idx1_0, S), Int32)
+    linear_idx = idx0_i32 .* Tile(array.strides[1]) .+ idx1_i32 .* Tile(array.strides[2])
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+    mask = (idx0_i32 .>= Tile(Int32(0))) .& (idx0_i32 .< Tile(array.sizes[1])) .&
+           (idx1_i32 .>= Tile(Int32(0))) .& (idx1_i32 .< Tile(array.sizes[2]))
+    (ptr_tile, mask, S)
+end
+
+"""
+Compute pointer tile and bounds mask for N-dimensional tile-level atomic operations.
+`index` is an N-tuple of tile-space indices (1-indexed).
+`Shape` is the tile shape.
+Returns (ptr_tile, mask).
+"""
+@inline function _tile_level_atomic_args(array::TileArray{T, N}, index::NTuple{N, Integer},
+                                         ::Val{Shape}) where {T, N, Shape}
+    # Create 1-indexed element index tiles for each dimension
+    # For dim d: arange [1..Shape[d]], reshaped for broadcasting, plus base offset
+    idx_tiles = ntuple(N) do d
+        bcast_shape = ntuple(i -> i == d ? Shape[d] : 1, N)
+        base = Int32((index[d] - 1) * Shape[d])
+        reshape(arange((Shape[d],), Int32), bcast_shape) .+ Tile(base)
+    end
+
+    # Compute 0-indexed linear offset: sum((idx[d] - 1) * stride[d])
+    linear_idx = reduce(.+, ntuple(N) do d
+        (idx_tiles[d] .- Tile(Int32(1))) .* Tile(array.strides[d])
+    end)
+
+    ptr_tile = Intrinsics.offset(array.ptr, linear_idx)
+
+    # Bounds mask: 1 <= idx[d] <= sizes[d] for all d
+    mask = reduce(.&, ntuple(N) do d
+        (idx_tiles[d] .>= Tile(Int32(1))) .& (idx_tiles[d] .<= Tile(array.sizes[d]))
+    end)
+
+    (ptr_tile, mask)
+end
+
+# ============================================================================
+# Generated RMW Operations (add, xchg, ...)
+# ============================================================================
+
+for (op, intrinsic) in ATOMIC_RMW_OPS
+    fname = Symbol("atomic_", op)
+    doc_op = string(op)
+
+    # 1D tile-wise with scalar value
+    @eval begin
+        @doc """
+            $($fname)(array::TileArray{T, 1}, indices::Tile, val::T; memory_order, memory_scope) -> Tile{T, S}
+
+        Tile-wise atomic $($doc_op) on a 1D array.
+        Indices are 1-indexed. Out-of-bounds indices are masked.
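+
+        # Example
+
+        Illustrative sketch (assumes `arr` is a 1D `TileArray{Int32}` kernel argument):
+
+        ```julia
+        indices = arange((16,), Int32)            # 1-indexed element positions
+        old = $($fname)(arr, indices, Int32(1))   # returns the previous values
+        ```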
+ """ + @inline function $fname(array::TileArray{T, 1}, indices::Tile{I, S}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + end + + # 1D tile-wise with tile value + @eval begin + @doc """ + $($fname)(array::TileArray{T, 1}, indices::Tile, val::Tile{T, S}; ...) -> Tile{T, S} + + Tile-wise atomic $($doc_op) with a tile of values. + """ + @inline function $fname(array::TileArray{T, 1}, indices::Tile{I, S}, val::Tile{T, S}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + Intrinsics.$intrinsic(ptr_tile, val, mask, memory_order, memory_scope) + end + end + + # 2D tile-wise with scalar value + @eval begin + @doc """ + $($fname)(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, val::T; ...) -> Tile{T, S} + + Tile-wise atomic $($doc_op) on a 2D array. + """ + @inline function $fname(array::TileArray{T, 2}, + indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, val::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1} + ptr_tile, mask, S = _atomic_ptr_mask_2d(array, indices) + val_tile = broadcast_to(Tile(val), S) + Intrinsics.$intrinsic(ptr_tile, val_tile, mask, memory_order, memory_scope) + end + end + + # 2D tile-wise with tile value + @eval begin + @doc """ + $($fname)(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, val::Tile; ...) -> Tile{T, S} + + Tile-wise atomic $($doc_op) on a 2D array with a tile of values. + """ + @inline function $fname(array::TileArray{T, 2}, + indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, val::Tile{T, Stile}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile} + ptr_tile, mask, S = _atomic_ptr_mask_2d_bc(array, indices, Stile) + val_bc = broadcast_to(val, S) + Intrinsics.$intrinsic(ptr_tile, val_bc, mask, memory_order, memory_scope) + end + end + + # Tile-level N-D (tuple of integer indices, tile value) + @eval begin + @doc """ + $($fname)(array::TileArray{T, N}, index, tile::Tile{T, Shape}; ...) -> Tile{T, Shape} + + Atomic $($doc_op) at tile-level index (like `store`). + Index can be an Integer (1D) or NTuple{N, Integer} (N-D). 
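+
+        # Example
+
+        Illustrative sketch (assumes `arr` is a 2D `TileArray{Float32}` kernel argument):
+
+        ```julia
+        tile = full((8, 8), 1.0f0, Float32)
+        old = $($fname)(arr, (1, 2), tile)   # tile-level index (1, 2), like `store`
+        ```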
+ """ + @inline function $fname(array::TileArray{T, N}, index::NTuple{N, Integer}, tile::Tile{T, Shape}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, N, Shape} + ptr_tile, mask = _tile_level_atomic_args(array, index, Val(Shape)) + Intrinsics.$intrinsic(ptr_tile, tile, mask, memory_order, memory_scope) + end + end + + # Tile-level 1D convenience (integer index -> 1-tuple) + @eval begin + @inline function $fname(array::TileArray{T, 1}, index::Integer, tile::Tile{T, Shape}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, Shape} + $fname(array, (index,), tile; memory_order, memory_scope) + end + end +end + +# ============================================================================ +# CAS Operations (separate - has expected + desired args) +# ============================================================================ + +""" + atomic_cas(array::TileArray{T, 1}, indices::Tile, expected, desired; memory_order, memory_scope) -> Tile{T, S} + +Tile-wise atomic compare-and-swap on a 1D array. +Indices are 1-indexed. Out-of-bounds indices are masked. +""" +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I, S}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +""" + atomic_cas(array::TileArray{T, 1}, indices::Tile, expected::Tile, desired::Tile; ...) -> Tile{T, S} + +Tile-wise atomic compare-and-swap with tiles of expected/desired values. +""" +@inline function atomic_cas(array::TileArray{T, 1}, indices::Tile{I, S}, + expected::Tile{T, S}, desired::Tile{T, S}; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I <: Integer, S} + ptr_tile, mask, _ = _atomic_ptr_mask_1d(array, indices) + Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask, + memory_order, memory_scope) +end + +""" + atomic_cas(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, expected, desired; ...) -> Tile{T, S} + +Tile-wise atomic compare-and-swap on a 2D array. +""" +@inline function atomic_cas(array::TileArray{T, 2}, + indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, + expected::T, desired::T; + memory_order::Int=MemoryOrder.AcqRel, + memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1} + ptr_tile, mask, S = _atomic_ptr_mask_2d(array, indices) + expected_tile = broadcast_to(Tile(expected), S) + desired_tile = broadcast_to(Tile(desired), S) + Intrinsics.atomic_cas_tile(ptr_tile, expected_tile, desired_tile, mask, + memory_order, memory_scope) +end + +""" + atomic_cas(array::TileArray{T, 2}, indices::Tuple{Tile, Tile}, expected::Tile, desired::Tile; ...) -> Tile{T, S} + +Tile-wise atomic compare-and-swap on a 2D array with tiles of values. 
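+
+# Example
+
+Illustrative sketch (index tiles shaped for broadcasting; `expected_tile` and
+`desired_tile` are assumed to be matching value tiles):
+
+```julia
+rows = reshape(arange((4,), Int32), (4, 1))
+cols = reshape(arange((4,), Int32), (1, 4))
+old = atomic_cas(arr, (rows, cols), expected_tile, desired_tile)
+```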
+"""
+@inline function atomic_cas(array::TileArray{T, 2},
+                            indices::Tuple{Tile{I0, S0}, Tile{I1, S1}},
+                            expected::Tile{T, Se}, desired::Tile{T, Sd};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Se, Sd}
+    # Broadcast against both value shapes so ptr_tile/mask match the broadcast values
+    ptr_tile, mask, S = _atomic_ptr_mask_2d_bc(array, indices, broadcast_shape(Se, Sd))
+    expected_bc = broadcast_to(expected, S)
+    desired_bc = broadcast_to(desired, S)
+    Intrinsics.atomic_cas_tile(ptr_tile, expected_bc, desired_bc, mask,
+                               memory_order, memory_scope)
+end
+
+"""
+    atomic_cas(array::TileArray{T, N}, index, expected::Tile, desired::Tile; ...) -> Tile{T, Shape}
+
+Atomic compare-and-swap at tile-level index (like `store`).
+Index can be an Integer (1D) or NTuple{N, Integer} (N-D).
+"""
+@inline function atomic_cas(array::TileArray{T, N}, index::NTuple{N, Integer},
+                            expected::Tile{T, Shape}, desired::Tile{T, Shape};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, N, Shape}
+    ptr_tile, mask = _tile_level_atomic_args(array, index, Val(Shape))
+    Intrinsics.atomic_cas_tile(ptr_tile, expected, desired, mask,
+                               memory_order, memory_scope)
+end
+
+# 1D convenience (integer index -> 1-tuple)
+@inline function atomic_cas(array::TileArray{T, 1}, index::Integer,
+                            expected::Tile{T, Shape}, desired::Tile{T, Shape};
+                            memory_order::Int=MemoryOrder.AcqRel,
+                            memory_scope::Int=MemScope.Device) where {T, Shape}
+    atomic_cas(array, (index,), expected, desired; memory_order, memory_scope)
+end
diff --git a/src/language/operations.jl b/src/language/operations.jl
index 463a358..eedcb2b 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -199,7 +199,7 @@ tile = ct.gather(arr, indices; latency=3)
 @inline function gather(array::TileArray{T, 1}, indices::Tile{I, S};
                         latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S}
     # Convert to 0-indexed
-    indices_0 = indices .- one(I)
+    indices_0 = indices .- One()
 
     # Convert to Int32 for consistency with array.sizes
     indices_i32 = astype(indices_0, Int32)
@@ -232,8 +232,8 @@ Indices are 1-indexed. Index tiles are broadcast to a common shape.
 @inline function gather(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}};
                         latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1}
     # Convert to 0-indexed
-    idx0_0 = indices[1] .- one(I0)
-    idx1_0 = indices[2] .- one(I1)
+    idx0_0 = indices[1] .- One()
+    idx1_0 = indices[2] .- One()
 
     # Broadcast indices to common shape
     S = broadcast_shape(S0, S1)
@@ -291,7 +291,7 @@ ct.scatter(arr, indices, result_tile; latency=3)
 @inline function scatter(array::TileArray{T, 1}, indices::Tile{I, S}, tile::Tile{T, S};
                         latency::Union{Int, Nothing}=nothing) where {T, I <: Integer, S}
     # Convert to 0-indexed
-    indices_0 = indices .- one(I)
+    indices_0 = indices .- One()
 
     # Convert to Int32 for consistency with array.sizes
     indices_i32 = astype(indices_0, Int32)
@@ -321,8 +321,8 @@ Indices are 1-indexed. Index tiles and value tile must broadcast to same shape.
@inline function scatter(array::TileArray{T, 2}, indices::Tuple{Tile{I0, S0}, Tile{I1, S1}}, tile::Tile{T, Stile}; latency::Union{Int, Nothing}=nothing) where {T, I0 <: Integer, I1 <: Integer, S0, S1, Stile} # Convert to 0-indexed - idx0_0 = indices[1] .- one(I0) - idx1_0 = indices[2] .- one(I1) + idx0_0 = indices[1] .- One() + idx1_0 = indices[2] .- One() # Broadcast indices to common shape (include value tile shape) S = broadcast_shape(broadcast_shape(S0, S1), Stile) diff --git a/test/codegen.jl b/test/codegen.jl index ae4b42e..b5515af 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -1094,6 +1094,66 @@ end end end + + @testset "tile-wise atomic_cas_tko" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + # Create index tile + @check "iota" + indices = ct.arange((16,), Int) + # Tile-wise atomic CAS + @check "offset" + @check "atomic_cas_tko" + ct.atomic_cas(arr, indices, Int32(0), Int32(1)) + return + end + end + end + + @testset "tile-wise atomic_rmw_tko" begin + spec = ct.ArraySpec{1}(16, true) + # tile-wise xchg + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_xchg(arr, indices, Int32(42)) + return + end + end + + # tile-wise add (integer) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, Int32(1)) + return + end + end + + # tile-wise add (float) + spec_f32 = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec_f32}}) do arr + @check "iota" + indices = ct.arange((16,), Int) + @check "offset" + @check "atomic_rmw_tko" + ct.atomic_add(arr, indices, 1.5f0) + return + end + end + end end #========================================================================= diff --git a/test/execution.jl b/test/execution.jl index a2072f3..6f60d5a 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1544,6 +1544,337 @@ end @test result == n_blocks end +@testset "atomic_add tile-wise 1D" begin + function atomic_add_tile_kernel(arr::ct.TileArray{Int,1}, TILE::ct.Constant{Int}) + bid = ct.bid(1) + base = (bid - 1) * TILE[] + indices = base .+ ct.arange((TILE[],), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Int, n) + + ct.launch(atomic_add_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-wise returns old values" begin + function atomic_add_return_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_add_return_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-wise Float32" begin + function atomic_add_f32_tile_kernel(arr::ct.TileArray{Float32,1}, TILE::ct.Constant{Int}) + bid = ct.bid(1) + base = (bid - 1) * TILE[] + indices = base .+ ct.arange((TILE[],), Int) + ct.atomic_add(arr, indices, 1.5f0; + 
memory_order=ct.MemoryOrder.AcqRel) + return + end + + tile_size = 16 + n = 256 + n_blocks = div(n, tile_size) + arr = CUDA.zeros(Float32, n) + + ct.launch(atomic_add_f32_tile_kernel, n_blocks, arr, ct.Constant(tile_size)) + + @test all(isapprox.(Array(arr), 1.5f0)) +end + +@testset "atomic_add tile-wise with tile values" begin + function atomic_add_tile_val_kernel(arr::ct.TileArray{Int,1}, + vals::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + val_tile = ct.gather(vals, indices) + old_vals = ct.atomic_add(arr, indices, val_tile; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + vals = CUDA.collect(Int, 1:16) + + ct.launch(atomic_add_tile_val_kernel, 1, arr, vals) + + @test Array(arr) == collect(1:16) +end + +@testset "atomic_xchg tile-wise" begin + function atomic_xchg_tile_kernel(arr::ct.TileArray{Int,1}) + bid = ct.bid(1) + indices = ct.arange((16,), Int) + ct.atomic_xchg(arr, indices, bid + 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_kernel, 1, arr) + + @test all(Array(arr) .== 2) +end + +@testset "atomic_cas tile-wise success" begin + function atomic_cas_tile_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 1; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.zeros(Int, 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_tile_kernel, 1, arr, out) + + @test all(Array(out) .== 0) + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-wise failure" begin + function atomic_cas_fail_kernel(arr::ct.TileArray{Int,1}, out::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + old_vals = ct.atomic_cas(arr, indices, 0, 2; + memory_order=ct.MemoryOrder.AcqRel) + ct.scatter(out, indices, old_vals) + return + end + + arr = CUDA.fill(Int(1), 16) + out = CUDA.fill(Int(-1), 16) + + ct.launch(atomic_cas_fail_kernel, 1, arr, out) + + @test all(Array(out) .== 1) + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-wise out-of-bounds" begin + function atomic_add_oob_kernel(arr::ct.TileArray{Int,1}) + indices = ct.arange((16,), Int) + ct.atomic_add(arr, indices, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + arr = CUDA.zeros(Int, 8) + + ct.launch(atomic_add_oob_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-level 1D" begin + function atomic_add_tile_level_kernel(arr::ct.TileArray{Int,1}) + tile = ct.full((16,), 1, Int) + ct.atomic_add(arr, 2, tile) + return + end + + arr = CUDA.zeros(Int, 32) + + ct.launch(atomic_add_tile_level_kernel, 1, arr) + + result = Array(arr) + @test all(result[1:16] .== 0) + @test all(result[17:32] .== 1) +end + +@testset "atomic_add tile-level 1D accumulates" begin + function atomic_add_tile_level_accum_kernel(arr::ct.TileArray{Int,1}) + tile = ct.full((16,), 1, Int) + ct.atomic_add(arr, 1, tile) + return + end + + arr = CUDA.zeros(Int, 16) + n_blocks = 10 + + ct.launch(atomic_add_tile_level_accum_kernel, n_blocks, arr) + + result = Array(arr) + @test all(result .== n_blocks) +end + +@testset "atomic_add tile-level 2D" begin + function atomic_add_tile_level_2d_kernel(arr::ct.TileArray{Float32,2}) + tile = ct.full((8, 8), 1.0f0, Float32) + ct.atomic_add(arr, (1, 2), tile) + return + end + + arr = CUDA.zeros(Float32, 16, 16) + + ct.launch(atomic_add_tile_level_2d_kernel, 1, arr) + + result = Array(arr) + @test all(result[1:8, 9:16] .== 1.0f0) + 
@test all(result[1:8, 1:8] .== 0.0f0) + @test all(result[9:16, :] .== 0.0f0) +end + +@testset "atomic_xchg tile-level 1D" begin + function atomic_xchg_tile_level_kernel(arr::ct.TileArray{Int,1}, + out::ct.TileArray{Int,1}) + tile = ct.full((16,), 42, Int) + old_vals = ct.atomic_xchg(arr, 1, tile) + ct.store(out, 1, old_vals) + return + end + + arr = CUDA.fill(Int(10), 16) + out = CUDA.zeros(Int, 16) + + ct.launch(atomic_xchg_tile_level_kernel, 1, arr, out) + + @test all(Array(arr) .== 42) + @test all(Array(out) .== 10) +end + +@testset "atomic_cas tile-level 1D success" begin + function atomic_cas_tile_level_kernel(arr::ct.TileArray{Int,1}) + expected = ct.full((16,), 0, Int) + desired = ct.full((16,), 1, Int) + ct.atomic_cas(arr, 1, expected, desired) + return + end + + arr = CUDA.zeros(Int, 16) + + ct.launch(atomic_cas_tile_level_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_add tile-level 3D" begin + function atomic_add_tile_level_3d_kernel(arr::ct.TileArray{Float32,3}) + tile = ct.full((4, 4, 4), 1.0f0, Float32) + ct.atomic_add(arr, (1, 2, 1), tile) + return + end + + arr = CUDA.zeros(Float32, 8, 8, 8) + + ct.launch(atomic_add_tile_level_3d_kernel, 1, arr) + + result = Array(arr) + # Tile at (1,2,1) means elements [1:4, 5:8, 1:4] + @test all(result[1:4, 5:8, 1:4] .== 1.0f0) + @test all(result[5:8, :, :] .== 0.0f0) + @test all(result[:, 1:4, :] .== 0.0f0) + @test all(result[:, :, 5:8] .== 0.0f0) +end + +@testset "atomic_add tile-level 3D accumulates" begin + function atomic_add_3d_accum_kernel(arr::ct.TileArray{Int,3}) + tile = ct.full((4, 4, 4), 1, Int) + ct.atomic_add(arr, (1, 1, 1), tile) + return + end + + arr = CUDA.zeros(Int, 4, 4, 4) + n_blocks = 5 + + ct.launch(atomic_add_3d_accum_kernel, n_blocks, arr) + + result = Array(arr) + @test all(result .== 5) +end + +@testset "atomic_add tile-level 4D" begin + function atomic_add_tile_level_4d_kernel(arr::ct.TileArray{Float32,4}) + tile = ct.full((2, 2, 2, 2), 1.0f0, Float32) + ct.atomic_add(arr, (2, 1, 2, 1), tile) + return + end + + arr = CUDA.zeros(Float32, 4, 4, 4, 4) + + ct.launch(atomic_add_tile_level_4d_kernel, 1, arr) + + result = Array(arr) + # Tile at (2,1,2,1) means elements [3:4, 1:2, 3:4, 1:2] + @test all(result[3:4, 1:2, 3:4, 1:2] .== 1.0f0) + # Check some zero regions + @test all(result[1:2, :, :, :] .== 0.0f0) + @test all(result[:, 3:4, :, :] .== 0.0f0) +end + +@testset "atomic_xchg tile-level 3D" begin + function atomic_xchg_tile_level_3d_kernel(arr::ct.TileArray{Int,3}, + out::ct.TileArray{Int,3}) + tile = ct.full((4, 4, 4), 42, Int) + old_vals = ct.atomic_xchg(arr, (1, 1, 1), tile) + ct.store(out, (1, 1, 1), old_vals) + return + end + + arr = CUDA.fill(Int(7), 4, 4, 4) + out = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_xchg_tile_level_3d_kernel, 1, arr, out) + + @test all(Array(arr) .== 42) + @test all(Array(out) .== 7) +end + +@testset "atomic_cas tile-level 3D success" begin + function atomic_cas_tile_level_3d_kernel(arr::ct.TileArray{Int,3}) + expected = ct.full((4, 4, 4), 0, Int) + desired = ct.full((4, 4, 4), 1, Int) + ct.atomic_cas(arr, (1, 1, 1), expected, desired) + return + end + + arr = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_cas_tile_level_3d_kernel, 1, arr) + + @test all(Array(arr) .== 1) +end + +@testset "atomic_cas tile-level 3D failure" begin + function atomic_cas_tile_level_3d_fail_kernel(arr::ct.TileArray{Int,3}, + out::ct.TileArray{Int,3}) + expected = ct.full((4, 4, 4), 0, Int) # wrong expected value + desired = ct.full((4, 4, 4), 99, Int) + old_vals = 
ct.atomic_cas(arr, (1, 1, 1), expected, desired) + ct.store(out, (1, 1, 1), old_vals) + return + end + + arr = CUDA.fill(Int(5), 4, 4, 4) # actual value is 5, not 0 + out = CUDA.zeros(Int, 4, 4, 4) + + ct.launch(atomic_cas_tile_level_3d_fail_kernel, 1, arr, out) + + @test all(Array(arr) .== 5) # unchanged (CAS failed) + @test all(Array(out) .== 5) # old values returned +end + @testset "1D gather - simple" begin # Simple 1D gather: copy first 16 elements using gather function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) @@ -1588,6 +1919,84 @@ end @test Array(dst) ≈ Array(src) end +@testset "2D gather - simple" begin + function gather_2d_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + row_indices = ct.reshape(ct.arange((4,), Int), (4, 1)) + col_indices = ct.reshape(ct.arange((4,), Int), (1, 4)) + tile = ct.gather(src, (row_indices, col_indices)) + ct.store(dst, (1, 1), tile) + return + end + + src = CUDA.rand(Float32, 4, 4) + dst = CUDA.zeros(Float32, 4, 4) + + ct.launch(gather_2d_kernel, 1, src, dst) + + @test Array(dst) ≈ Array(src) +end + +@testset "2D scatter - simple" begin + function scatter_2d_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + tile = ct.load(src, (1, 1), (4, 4)) + row_indices = ct.reshape(ct.arange((4,), Int), (4, 1)) + col_indices = ct.reshape(ct.arange((4,), Int), (1, 4)) + ct.scatter(dst, (row_indices, col_indices), tile) + return + end + + src = CUDA.rand(Float32, 4, 4) + dst = CUDA.zeros(Float32, 4, 4) + + ct.launch(scatter_2d_kernel, 1, src, dst) + + @test Array(dst) ≈ Array(src) +end + +@testset "2D gather - strided access" begin + function gather_2d_strided_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + row_base = ct.arange((4,), Int) + col_base = ct.arange((4,), Int) + row_indices = ct.reshape(row_base .* 2 .- 1, (4, 1)) # 1,3,5,7 + col_indices = ct.reshape(col_base .* 2 .- 1, (1, 4)) # 1,3,5,7 + tile = ct.gather(src, (row_indices, col_indices)) + ct.store(dst, (1, 1), tile) + return + end + + src = CUDA.rand(Float32, 8, 8) + dst = CUDA.zeros(Float32, 4, 4) + + ct.launch(gather_2d_strided_kernel, 1, src, dst) + + src_cpu = Array(src) + expected = src_cpu[1:2:7, 1:2:7] + @test Array(dst) ≈ expected +end + +@testset "2D scatter - strided access" begin + function scatter_2d_strided_kernel(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + tile = ct.load(src, (1, 1), (4, 4)) + row_base = ct.arange((4,), Int) + col_base = ct.arange((4,), Int) + row_indices = ct.reshape(row_base .* 2 .- 1, (4, 1)) + col_indices = ct.reshape(col_base .* 2 .- 1, (1, 4)) + ct.scatter(dst, (row_indices, col_indices), tile) + return + end + + src = CUDA.rand(Float32, 4, 4) + dst = CUDA.zeros(Float32, 8, 8) + + ct.launch(scatter_2d_strided_kernel, 1, src, dst) + + src_cpu = Array(src) + dst_cpu = Array(dst) + @test dst_cpu[1:2:7, 1:2:7] ≈ src_cpu + @test all(dst_cpu[2:2:8, :] .== 0) + @test all(dst_cpu[:, 2:2:8] .== 0) +end + end @testset "Entry Hints" begin