Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions deps/generate_interfaces.jl
Original file line number Diff line number Diff line change
Expand Up @@ -444,20 +444,48 @@ function generate_cpp(library::String, filename::Vector{String}, output::String;
variant = "column_major::"
end

# Build catch clause: LAPACK functions also catch computation_error for info
lapack_catch = "catch (const oneapi::mkl::lapack::computation_error& e) { return e.info(); } catch (const sycl::exception& e) { return -1; }"
sycl_catch = "catch (const sycl::exception& e) { return -1; }"

write(oneapi_cpp, "extern \"C\" $header {\n")
if template
type = version_types[version]
!occursin("scratchpad_size", name) && write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name<$type>($parameters, {});\n device_queue->val.wait_and_throw();\n")
occursin("scratchpad_size", name) && write(oneapi_cpp, " int64_t scratchpad_size = oneapi::mkl::$library::$variant$name<$type>($parameters);\n device_queue->val.wait_and_throw();\n")
# !occursin("scratchpad_size", name) && write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name<$type>($parameters, {});\n")
# occursin("scratchpad_size", name) && write(oneapi_cpp, " int64_t scratchpad_size = oneapi::mkl::$library::$variant$name<$type>($parameters);\n")
if !occursin("scratchpad_size", name)
catch_clause = library == "lapack" ? lapack_catch : sycl_catch
write(oneapi_cpp, " try {\n")
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name<$type>($parameters, {});\n")
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
write(oneapi_cpp, " } $catch_clause\n")
end
if occursin("scratchpad_size", name)
write(oneapi_cpp, " int64_t scratchpad_size = oneapi::mkl::$library::$variant$name<$type>($parameters);\n device_queue->val.wait_and_throw();\n")
end
else
if !(name void_output)
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name($parameters, {});\n")
occursin("device_queue", parameters) && write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
has_queue = occursin("device_queue", parameters)
is_scratchpad = occursin("scratchpad_size", name)
if has_queue && !is_scratchpad
catch_clause = library == "lapack" ? lapack_catch : sycl_catch
write(oneapi_cpp, " try {\n")
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name($parameters, {});\n")
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
write(oneapi_cpp, " } $catch_clause\n")
else
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name($parameters, {});\n")
if has_queue
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
end
end
else
write(oneapi_cpp, " oneapi::mkl::$library::$variant$name($parameters);\n")
occursin("device_queue", parameters) && write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
if occursin("device_queue", parameters)
write(oneapi_cpp, " try {\n")
write(oneapi_cpp, " oneapi::mkl::$library::$variant$name($parameters);\n")
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
write(oneapi_cpp, " } $sycl_catch\n")
else
write(oneapi_cpp, " oneapi::mkl::$library::$variant$name($parameters);\n")
end
end
end
if occursin("scratchpad_size", name)
Expand Down
3,120 changes: 2,080 additions & 1,040 deletions deps/src/onemkl.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions deps/src/sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ extern "C" int syclQueueDestroy(syclQueue_t obj) {
return 0;
}

// Block the calling host thread until every command previously submitted to
// `obj`'s SYCL queue has completed.
//
// Returns 0 on success, -1 if waiting surfaced an error. C++ exceptions must
// never propagate across the extern "C" boundary (that is undefined
// behavior), so any std::exception — sycl::exception derives from it — is
// converted to the -1 error code, matching the catch-and-return convention
// used by the generated oneMKL wrappers in this repository.
extern "C" int syclQueueWait(syclQueue_t obj) {
    try {
        obj->val.wait();
    } catch (const std::exception& e) {
        return -1;
    }
    return 0;
}

extern "C" int syclEventCreate(syclEvent_t *obj, syclContext_t context,
ze_event_handle_t event, int keep_ownership) {
auto sycl_ownership =
Expand Down
1 change: 1 addition & 0 deletions deps/src/sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ typedef struct syclQueue_st *syclQueue_t;
int syclQueueCreate(syclQueue_t *obj, syclContext_t context, syclDevice_t device,
ze_command_queue_handle_t queue, int keep_ownership);
int syclQueueDestroy(syclQueue_t obj);
int syclQueueWait(syclQueue_t obj);

typedef struct syclEvent_st *syclEvent_t;
int syclEventCreate(syclEvent_t *obj, syclContext_t context,
Expand Down
4 changes: 2 additions & 2 deletions lib/mkl/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ function LinearAlgebra.generic_matvecmul!(Y::oneVector, tA::AbstractChar, A::one
if tA in ('N', 'T', 'C')
return gemv!(tA, alpha, A, B, beta, Y)
elseif tA in ('S', 's')
return symv!(tA == 'S' ? 'U' : 'L', alpha, A, x, beta, y)
return symv!(tA == 'S' ? 'U' : 'L', alpha, A, B, beta, Y)
elseif tA in ('H', 'h')
return hemv!(tA == 'H' ? 'U' : 'L', alpha, A, x, beta, y)
return hemv!(tA == 'H' ? 'U' : 'L', alpha, A, B, beta, Y)
end
end
end
Expand Down
4 changes: 4 additions & 0 deletions lib/support/liboneapi_support.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ function syclQueueDestroy(obj)
@ccall liboneapi_support.syclQueueDestroy(obj::syclQueue_t)::Cint
end

# Host-side wait on a SYCL queue: blocks until all work previously submitted
# to `obj` has completed. Returns the C shim's status code (`Cint`, 0 on
# success).
function syclQueueWait(obj)
@ccall liboneapi_support.syclQueueWait(obj::syclQueue_t)::Cint
end

mutable struct syclEvent_st end

const syclEvent_t = Ptr{syclEvent_st}
Expand Down
20 changes: 14 additions & 6 deletions src/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@ import oneAPI
import oneAPI: oneArray, oneAPIBackend
import AcceleratedKernels as AK

# Use a smaller block size on Intel GPUs to work around a scan correctness issue
# with the Blelloch parallel prefix sum at larger block sizes (>=128).
const _ACCUMULATE_BLOCK_SIZE = 64

# Accumulate operations using AcceleratedKernels
Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)), kwargs...) =
AK.accumulate!(op, B, A, oneAPIBackend(); init, kwargs...)
# In-place accumulate: forwards to AcceleratedKernels, defaulting `block_size`
# to the reduced `_ACCUMULATE_BLOCK_SIZE` (callers may still override it).
Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)),
block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.accumulate!(op, B, A, oneAPIBackend(); init, block_size, kwargs...)

Base.accumulate(op, A::oneArray; init = zero(eltype(A)), kwargs...) =
AK.accumulate(op, A, oneAPIBackend(); init, kwargs...)
# Out-of-place accumulate: same AcceleratedKernels forwarding, with the same
# reduced default `block_size`.
Base.accumulate(op, A::oneArray; init = zero(eltype(A)),
block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.accumulate(op, A, oneAPIBackend(); block_size, init, kwargs...)

Base.cumsum(src::oneArray; kwargs...) = AK.cumsum(src, oneAPIBackend(); kwargs...)
Base.cumprod(src::oneArray; kwargs...) = AK.cumprod(src, oneAPIBackend(); kwargs...)
# Cumulative sum/product: forward to AcceleratedKernels with the reduced
# default `block_size` (overridable by the caller).
Base.cumsum(src::oneArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.cumsum(src, oneAPIBackend(); block_size, kwargs...)
Base.cumprod(src::oneArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.cumprod(src, oneAPIBackend(); block_size, kwargs...)
33 changes: 25 additions & 8 deletions src/mapreduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,31 @@
# - serial version for lower latency
# - group-stride loop to delay need for second kernel launch

# Sub-word element types (1/2-byte integers and Bool) are stored widened to
# Int32 in local memory: writing 8/16-bit values there can clobber adjacent
# bytes on Intel GPUs. All other types map to themselves, so the `%`-based
# truncating conversions used alongside this remain valid (they are only
# generated for the integer/boolean cases).
for narrow in (Bool, Int8, UInt8, Int16, UInt16)
    @eval @inline _widen_type(::Type{$narrow}) = Int32
end
@inline _widen_type(::Type{T}) where T = T

# Conversions between the widened storage type and the computation type.
# A dedicated identity method for matching types ensures the truncating
# `rem(x, T)` (i.e. `x % T`) path is never compiled for non-integer element
# types, where it would be invalid.
@inline _to_wide(x, ::Type{W}) where W = rem(x, W)
@inline _to_wide(x::T, ::Type{T}) where T = x
@inline _from_wide(x, ::Type{T}) where T = rem(x, T)
@inline _from_wide(x::T, ::Type{T}) where T = x

# Reduce a value across a group, using local memory for communication
@inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems}
items = get_local_size()
item = get_local_id()

# local mem for a complete reduction
shared = oneLocalArray(T, (maxitems,))
@inbounds shared[item] = val
# use a wider type for shared memory to avoid sub-word corruption
W = _widen_type(T)
shared = oneLocalArray(W, (maxitems,))
@inbounds shared[item] = _to_wide(val, W)

# perform a reduction
d = 1
Expand All @@ -20,18 +37,18 @@
index = 2 * d * (item-1) + 1
@inbounds if index <= items
other_val = if index + d <= items
shared[index+d]
_from_wide(shared[index+d], T)
else
neutral
end
shared[index] = op(shared[index], other_val)
shared[index] = _to_wide(op(_from_wide(shared[index], T), other_val), W)
end
d *= 2
end

# load the final value on the first item
if item == 1
val = @inbounds shared[item]
val = @inbounds _from_wide(shared[item], T)
end

return val
Expand Down Expand Up @@ -135,8 +152,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
# that's why each items also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.

# group size is restricted by local memory
max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(T)
# group size is restricted by local memory (use widened type for sub-word types)
max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(_widen_type(T))
max_items = min(compute_properties(device()).maxTotalGroupSize,
compute_items(max_lmem_elements ÷ 2))
# TODO: dynamic local memory to avoid two compilations
Expand Down
Loading