From 59956d593f0f519932eb630fcee58ec964af88ff Mon Sep 17 00:00:00 2001
From: AntonOresten
Date: Thu, 22 Jan 2026 13:48:48 +0100
Subject: [PATCH] add support and tests for PermutedDimsArray

---
 src/language/types.jl |  9 +++++++--
 test/execution.jl     | 26 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/src/language/types.jl b/src/language/types.jl
index 813f446..179a1e4 100644
--- a/src/language/types.jl
+++ b/src/language/types.jl
@@ -162,6 +162,11 @@ function TileArray(ptr::Ptr{T}, sizes::NTuple{N, Int32}, strides::NTuple{N, Int3
     TileArray{T, N, spec}(ptr, sizes, strides)
 end
 
+# Pointer to the storage backing `arr`. `PermutedDimsArray` wrappers are
+# unwrapped recursively, so nested wrappers resolve to the innermost parent.
+_pointer(arr::AbstractArray) = pointer(arr)
+_pointer(arr::PermutedDimsArray) = _pointer(parent(arr))
+
 """
     TileArray(arr)
 
@@ -169,13 +174,13 @@ Create a TileArray from a CUDA array (CuArray or similar).
 Automatically extracts pointer, sizes, strides, and computes ArraySpec.
 
 This method works with any array type that supports:
-- `pointer(arr)` - returns device pointer
+- `pointer(arr)` - returns device pointer (or parent's pointer for wrapper types)
 - `size(arr)` - returns array dimensions
 - `strides(arr)` - returns array strides
 """
 function TileArray(arr::AbstractArray{T, N}) where {T, N}
     # Use reinterpret to handle both Ptr and CuPtr (device pointers)
-    ptr = reinterpret(Ptr{T}, pointer(arr))
+    ptr = reinterpret(Ptr{T}, _pointer(arr))
     sizes = NTuple{N, Int32}(Int32.(size(arr)))
     strides_val = NTuple{N, Int32}(Int32.(strides(arr)))
     TileArray(ptr, sizes, strides_val)
diff --git a/test/execution.jl b/test/execution.jl
index a2072f3..fb5a7d9 100644
--- a/test/execution.jl
+++ b/test/execution.jl
@@ -418,6 +418,32 @@ end
     end
 end
 
+@testset "strided" begin
+    @testset "PermutedDimsArray" begin
+        function copy_kernel_2d(
+            src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2},
+            tile_x::ct.Constant{Int}, tile_y::ct.Constant{Int}
+        )
+            bid_x = ct.bid(1)
+            bid_y = ct.bid(2)
+            tile = ct.load(src, (bid_x, bid_y), (tile_x[], tile_y[]))
+            ct.store(dst, (bid_x, bid_y), tile)
+            return
+        end
+
+        m, n = 64, 32
+        tm, tn = 16, 16
+        A = CuArray(Float32.(reshape(1:n*m, n, m)))
+        P = PermutedDimsArray(A, (2, 1))
+        out = CUDA.zeros(Float32, m, n)
+
+        grid = (cld(m, tm), cld(n, tn))
+        ct.launch(copy_kernel_2d, grid, P, out, ct.Constant(tm), ct.Constant(tn))
+
+        @test out == permutedims(A, (2, 1))
+    end
+end
+
 @testset "extract" begin
     @testset "extract identity (0,0) full shape" begin
         function extract_identity_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2})