Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 36 additions & 8 deletions deps/generate_interfaces.jl
Original file line number Diff line number Diff line change
Expand Up @@ -444,20 +444,48 @@ function generate_cpp(library::String, filename::Vector{String}, output::String;
variant = "column_major::"
end

# Build catch clause: LAPACK functions also catch computation_error for info
lapack_catch = "catch (const oneapi::mkl::lapack::computation_error& e) { return e.info(); } catch (const sycl::exception& e) { return -1; }"
sycl_catch = "catch (const sycl::exception& e) { return -1; }"

write(oneapi_cpp, "extern \"C\" $header {\n")
if template
type = version_types[version]
!occursin("scratchpad_size", name) && write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name<$type>($parameters, {});\n device_queue->val.wait_and_throw();\n")
occursin("scratchpad_size", name) && write(oneapi_cpp, " int64_t scratchpad_size = oneapi::mkl::$library::$variant$name<$type>($parameters);\n device_queue->val.wait_and_throw();\n")
# !occursin("scratchpad_size", name) && write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name<$type>($parameters, {});\n")
# occursin("scratchpad_size", name) && write(oneapi_cpp, " int64_t scratchpad_size = oneapi::mkl::$library::$variant$name<$type>($parameters);\n")
if !occursin("scratchpad_size", name)
catch_clause = library == "lapack" ? lapack_catch : sycl_catch
write(oneapi_cpp, " try {\n")
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name<$type>($parameters, {});\n")
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
write(oneapi_cpp, " } $catch_clause\n")
end
if occursin("scratchpad_size", name)
write(oneapi_cpp, " int64_t scratchpad_size = oneapi::mkl::$library::$variant$name<$type>($parameters);\n device_queue->val.wait_and_throw();\n")
end
else
if !(name void_output)
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name($parameters, {});\n")
occursin("device_queue", parameters) && write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
has_queue = occursin("device_queue", parameters)
is_scratchpad = occursin("scratchpad_size", name)
if has_queue && !is_scratchpad
catch_clause = library == "lapack" ? lapack_catch : sycl_catch
write(oneapi_cpp, " try {\n")
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name($parameters, {});\n")
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
write(oneapi_cpp, " } $catch_clause\n")
else
write(oneapi_cpp, " auto status = oneapi::mkl::$library::$variant$name($parameters, {});\n")
if has_queue
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
end
end
else
write(oneapi_cpp, " oneapi::mkl::$library::$variant$name($parameters);\n")
occursin("device_queue", parameters) && write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
if occursin("device_queue", parameters)
write(oneapi_cpp, " try {\n")
write(oneapi_cpp, " oneapi::mkl::$library::$variant$name($parameters);\n")
write(oneapi_cpp, " device_queue->val.wait_and_throw();\n")
write(oneapi_cpp, " } $sycl_catch\n")
else
write(oneapi_cpp, " oneapi::mkl::$library::$variant$name($parameters);\n")
end
end
end
if occursin("scratchpad_size", name)
Expand Down
3,120 changes: 2,080 additions & 1,040 deletions deps/src/onemkl.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions deps/src/sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ extern "C" int syclQueueDestroy(syclQueue_t obj) {
return 0;
}

// Block the calling host thread until every command previously submitted to
// `obj`'s SYCL queue has completed.
//
// Returns 0 on success, -1 if waiting surfaced an error. C++ exceptions must
// never propagate across the extern "C" boundary (that is undefined
// behavior), so any std::exception — sycl::exception derives from it — is
// converted to the -1 error code, matching the catch-and-return convention
// used by the generated oneMKL wrappers in this repository.
extern "C" int syclQueueWait(syclQueue_t obj) {
    try {
        obj->val.wait();
    } catch (const std::exception& e) {
        return -1;
    }
    return 0;
}

extern "C" int syclEventCreate(syclEvent_t *obj, syclContext_t context,
ze_event_handle_t event, int keep_ownership) {
auto sycl_ownership =
Expand Down
1 change: 1 addition & 0 deletions deps/src/sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ typedef struct syclQueue_st *syclQueue_t;
int syclQueueCreate(syclQueue_t *obj, syclContext_t context, syclDevice_t device,
ze_command_queue_handle_t queue, int keep_ownership);
int syclQueueDestroy(syclQueue_t obj);
int syclQueueWait(syclQueue_t obj);

typedef struct syclEvent_st *syclEvent_t;
int syclEventCreate(syclEvent_t *obj, syclContext_t context,
Expand Down
4 changes: 2 additions & 2 deletions lib/mkl/linalg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,9 @@ function LinearAlgebra.generic_matvecmul!(Y::oneVector, tA::AbstractChar, A::one
if tA in ('N', 'T', 'C')
return gemv!(tA, alpha, A, B, beta, Y)
elseif tA in ('S', 's')
return symv!(tA == 'S' ? 'U' : 'L', alpha, A, x, beta, y)
return symv!(tA == 'S' ? 'U' : 'L', alpha, A, B, beta, Y)
elseif tA in ('H', 'h')
return hemv!(tA == 'H' ? 'U' : 'L', alpha, A, x, beta, y)
return hemv!(tA == 'H' ? 'U' : 'L', alpha, A, B, beta, Y)
end
end
end
Expand Down
4 changes: 4 additions & 0 deletions lib/support/liboneapi_support.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ function syclQueueDestroy(obj)
@ccall liboneapi_support.syclQueueDestroy(obj::syclQueue_t)::Cint
end

# Host-side wait on a SYCL queue: blocks until all work previously submitted
# to `obj` has completed. Returns the C shim's status code (`Cint`, 0 on
# success).
function syclQueueWait(obj)
@ccall liboneapi_support.syclQueueWait(obj::syclQueue_t)::Cint
end

mutable struct syclEvent_st end

const syclEvent_t = Ptr{syclEvent_st}
Expand Down
20 changes: 14 additions & 6 deletions src/accumulate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@ import oneAPI
import oneAPI: oneArray, oneAPIBackend
import AcceleratedKernels as AK

# Use a smaller block size on Intel GPUs to work around a scan correctness issue
# with the Blelloch parallel prefix sum at larger block sizes (>=128).
const _ACCUMULATE_BLOCK_SIZE = 64

# Accumulate operations using AcceleratedKernels
Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)), kwargs...) =
AK.accumulate!(op, B, A, oneAPIBackend(); init, kwargs...)
# In-place accumulate: forwards to AcceleratedKernels, defaulting `block_size`
# to the reduced `_ACCUMULATE_BLOCK_SIZE` (callers may still override it).
Base.accumulate!(op, B::oneArray, A::oneArray; init = zero(eltype(A)),
block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.accumulate!(op, B, A, oneAPIBackend(); init, block_size, kwargs...)

Base.accumulate(op, A::oneArray; init = zero(eltype(A)), kwargs...) =
AK.accumulate(op, A, oneAPIBackend(); init, kwargs...)
# Out-of-place accumulate: same AcceleratedKernels forwarding, with the same
# reduced default `block_size`.
Base.accumulate(op, A::oneArray; init = zero(eltype(A)),
block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.accumulate(op, A, oneAPIBackend(); block_size, init, kwargs...)

Base.cumsum(src::oneArray; kwargs...) = AK.cumsum(src, oneAPIBackend(); kwargs...)
Base.cumprod(src::oneArray; kwargs...) = AK.cumprod(src, oneAPIBackend(); kwargs...)
# Cumulative sum/product: forward to AcceleratedKernels with the reduced
# default `block_size` (overridable by the caller).
Base.cumsum(src::oneArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.cumsum(src, oneAPIBackend(); block_size, kwargs...)
Base.cumprod(src::oneArray; block_size = _ACCUMULATE_BLOCK_SIZE, kwargs...) =
AK.cumprod(src, oneAPIBackend(); block_size, kwargs...)
33 changes: 25 additions & 8 deletions src/mapreduce.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,31 @@
# - serial version for lower latency
# - group-stride loop to delay need for second kernel launch

# Sub-word element types (1/2-byte integers and Bool) are stored widened to
# Int32 in local memory: writing 8/16-bit values there can clobber adjacent
# bytes on Intel GPUs. All other types map to themselves, so the `%`-based
# truncating conversions used alongside this remain valid (they are only
# generated for the integer/boolean cases).
for narrow in (Bool, Int8, UInt8, Int16, UInt16)
    @eval @inline _widen_type(::Type{$narrow}) = Int32
end
@inline _widen_type(::Type{T}) where T = T

# Conversions between the widened storage type and the computation type.
# A dedicated identity method for matching types ensures the truncating
# `rem(x, T)` (i.e. `x % T`) path is never compiled for non-integer element
# types, where it would be invalid.
@inline _to_wide(x, ::Type{W}) where W = rem(x, W)
@inline _to_wide(x::T, ::Type{T}) where T = x
@inline _from_wide(x, ::Type{T}) where T = rem(x, T)
@inline _from_wide(x::T, ::Type{T}) where T = x

# Reduce a value across a group, using local memory for communication
@inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems}
items = get_local_size()
item = get_local_id()

# local mem for a complete reduction
shared = oneLocalArray(T, (maxitems,))
@inbounds shared[item] = val
# use a wider type for shared memory to avoid sub-word corruption
W = _widen_type(T)
shared = oneLocalArray(W, (maxitems,))
@inbounds shared[item] = _to_wide(val, W)

# perform a reduction
d = 1
Expand All @@ -20,18 +37,18 @@
index = 2 * d * (item-1) + 1
@inbounds if index <= items
other_val = if index + d <= items
shared[index+d]
_from_wide(shared[index+d], T)
else
neutral
end
shared[index] = op(shared[index], other_val)
shared[index] = _to_wide(op(_from_wide(shared[index], T), other_val), W)
end
d *= 2
end

# load the final value on the first item
if item == 1
val = @inbounds shared[item]
val = @inbounds _from_wide(shared[item], T)
end

return val
Expand Down Expand Up @@ -135,8 +152,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
# that's why each items also loops across their inputs, processing multiple values
# so that we can span the entire reduction dimension using a single item group.

# group size is restricted by local memory
max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(T)
# group size is restricted by local memory (use widened type for sub-word types)
max_lmem_elements = compute_properties(device()).maxSharedLocalMemory ÷ sizeof(_widen_type(T))
max_items = min(compute_properties(device()).maxTotalGroupSize,
compute_items(max_lmem_elements ÷ 2))
# TODO: dynamic local memory to avoid two compilations
Expand Down
Loading