diff --git a/src/array.jl b/src/array.jl index 40a6bfa1d..a73627f75 100644 --- a/src/array.jl +++ b/src/array.jl @@ -31,7 +31,7 @@ function check_eltype(T) Base.isbitsunion(T) && error("MtlArray does not yet support isbits-union arrays") contains_eltype(T, Float64) && error("Metal does not support Float64 values, try using Float32 instead") contains_eltype(T, Int128) && error("Metal does not support Int128 values, try using Int64 instead") - contains_eltype(T, UInt128) && error("Metal does not support UInt128 values, try using UInt64 instead") + return contains_eltype(T, UInt128) && error("Metal does not support UInt128 values, try using UInt64 instead") end """ @@ -43,14 +43,14 @@ end See the Array Programming section of the Metal.jl docs for more details. """ -mutable struct MtlArray{T,N,S} <: AbstractGPUArray{T,N} +mutable struct MtlArray{T, N, S} <: AbstractGPUArray{T, N} data::DataRef{<:MTLBuffer} maxsize::Int # maximum data size in bytes; excluding any selector bytes offset::Int # offset of the data in the buffer, in number of elements dims::Dims{N} - function MtlArray{T,N,S}(::UndefInitializer, dims::Dims{N}) where {T,N,S} + function MtlArray{T, N, S}(::UndefInitializer, dims::Dims{N}) where {T, N, S} check_eltype(T) maxsize = prod(dims) * sizeof(T) @@ -75,44 +75,48 @@ mutable struct MtlArray{T,N,S} <: AbstractGPUArray{T,N} end data[].label = "MtlArray{$(T),$(N),$(S)}(dims=$dims)" - obj = new{T,N,S}(data, maxsize, 0, dims) - finalizer(unsafe_free!, obj) + obj = new{T, N, S}(data, maxsize, 0, dims) + return finalizer(unsafe_free!, obj) end - function MtlArray{T,N,S}(data::DataRef{<:MTLBuffer}, dims::Dims{N}; - maxsize::Int=prod(dims) * sizeof(T), offset::Int=0) where {T,N,S} + function MtlArray{T, N, S}( + data::DataRef{<:MTLBuffer}, dims::Dims{N}; + maxsize::Int = prod(dims) * sizeof(T), offset::Int = 0 + ) where {T, N, S} check_eltype(T) storagemode = convert(MTL.MTLStorageMode, S) if storagemode != data[].storageMode error("Storage mode mismatch: expected $S, got $(data[].storageMode)") end obj = new{T, N, S}(copy(data), maxsize, offset, dims) - finalizer(unsafe_free!, obj) + return finalizer(unsafe_free!, obj) end - function MtlArray{T,N}(data::DataRef{<:MTLBuffer}, dims::Dims{N}; - maxsize::Int=prod(dims) * sizeof(T), offset::Int=0) where {T,N} + function MtlArray{T, N}( + data::DataRef{<:MTLBuffer}, dims::Dims{N}; + maxsize::Int = prod(dims) * sizeof(T), offset::Int = 0 + ) where {T, N} check_eltype(T) storagemode = data[].storageMode obj = if storagemode == MTL.MTLStorageModeShared - new{T,N,SharedStorage}(copy(data), maxsize, offset, dims) + new{T, N, SharedStorage}(copy(data), maxsize, offset, dims) elseif storagemode == MTL.MTLStorageModeManaged @warn "`ManagedStorage` is no longer supported with `MtlArray`s. Instead, use `SharedStorage` or use the Metal api directly from `Metal.MTL`." - new{T,N,ManagedStorage}(copy(data), maxsize, offset, dims) + new{T, N, ManagedStorage}(copy(data), maxsize, offset, dims) elseif storagemode == MTL.MTLStorageModePrivate - new{T,N,PrivateStorage}(copy(data), maxsize, offset, dims) + new{T, N, PrivateStorage}(copy(data), maxsize, offset, dims) elseif storagemode == MTL.MTLStorageModeMemoryless - new{T,N,Memoryless}(copy(data), maxsize, offset, dims) + new{T, N, Memoryless}(copy(data), maxsize, offset, dims) end - finalizer(unsafe_free!, obj) + return finalizer(unsafe_free!, obj) end end # Create MtlArray from MTLBuffer -function MtlArray{T,N}(buf::B, dims::Dims{N}; kwargs...) 
where {B<:MTLBuffer,T,N} +function MtlArray{T, N}(buf::B, dims::Dims{N}; kwargs...) where {B <: MTLBuffer, T, N} data = DataRef(buf) do buf free(buf) end - return MtlArray{T,N}(data, dims; kwargs...) + return MtlArray{T, N}(data, dims; kwargs...) end GPUArrays.storage(a::MtlArray) = a.data @@ -125,7 +129,7 @@ Get the Metal device for an MtlArray. device(A::MtlArray) = A.data[].device storagemode(x::MtlArray) = storagemode(typeof(x)) -storagemode(::Type{<:MtlArray{<:Any,<:Any,S}}) where {S} = S +storagemode(::Type{<:MtlArray{<:Any, <:Any, S}}) where {S} = S """ is_shared(A::MtlArray)::Bool @@ -168,7 +172,7 @@ for MtlArray{T,1,S}. See also `Vector`(@ref), and the Array Programming section of the Metal.jl docs for more details. """ -const MtlVector{T,S} = MtlArray{T,1,S} +const MtlVector{T, S} = MtlArray{T, 1, S} """ MtlMatrix{T,S} <: AbstractGPUMatrix{T} @@ -178,7 +182,7 @@ for MtlArray{T,2,S}. See also `Matrix`(@ref), and the Array Programming section of the Metal.jl docs for more details. """ -const MtlMatrix{T,S} = MtlArray{T,2,S} +const MtlMatrix{T, S} = MtlArray{T, 2, S} """ MtlVecOrMat{T,S} @@ -188,54 +192,63 @@ MtlMatrix or an MtlVector. See also `VecOrMat`(@ref) for examples. """ -const MtlVecOrMat{T,S} = Union{MtlVector{T,S},MtlMatrix{T,S}} +const MtlVecOrMat{T, S} = Union{MtlVector{T, S}, MtlMatrix{T, S}} -# default to private memory -const DefaultStorageMode = let str = @load_preference("default_storage", "private") +# default storage mode: "auto" selects based on unified memory architecture +# - UMA devices (Apple Silicon): SharedStorage (zero-copy CPU access) +# - Non-UMA devices (Intel discrete GPU): PrivateStorage +# - Non-Apple platforms: PrivateStorage (Metal not available) +const DefaultStorageMode = let str = @load_preference("default_storage", "auto") if str == "private" PrivateStorage elseif str == "shared" SharedStorage + elseif str == "auto" + if Sys.isapple() && !isempty(devices()) + MTLDevice(1).hasUnifiedMemory ? 
SharedStorage : PrivateStorage + else + PrivateStorage + end else - error("unknown default storage mode: $default_storage") + error("unknown default storage mode: $str") end end -MtlArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} = - MtlArray{T,N,DefaultStorageMode}(undef, dims) +MtlArray{T, N}(::UndefInitializer, dims::Dims{N}) where {T, N} = + MtlArray{T, N, DefaultStorageMode}(undef, dims) # storage, type and dimensionality specified -MtlArray{T,N,S}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N,S} = - MtlArray{T,N,S}(undef, convert(Tuple{Vararg{Int}}, dims)) -MtlArray{T,N,S}(::UndefInitializer, dims::Vararg{Integer,N}) where {T,N,S} = - MtlArray{T,N,S}(undef, convert(Tuple{Vararg{Int}}, dims)) +MtlArray{T, N, S}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N, S} = + MtlArray{T, N, S}(undef, convert(Tuple{Vararg{Int}}, dims)) +MtlArray{T, N, S}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N, S} = + MtlArray{T, N, S}(undef, convert(Tuple{Vararg{Int}}, dims)) # type and dimensionality specified -MtlArray{T,N}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N} = - MtlArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims)) -MtlArray{T,N}(::UndefInitializer, dims::Vararg{Integer,N}) where {T,N} = - MtlArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims)) +MtlArray{T, N}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N} = + MtlArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims)) +MtlArray{T, N}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N} = + MtlArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims)) # only type specified -MtlArray{T}(::UndefInitializer, dims::NTuple{N,Integer}) where {T,N} = - MtlArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims)) -MtlArray{T}(::UndefInitializer, dims::Vararg{Integer,N}) where {T,N} = - MtlArray{T,N}(undef, convert(Tuple{Vararg{Int}}, dims)) +MtlArray{T}(::UndefInitializer, dims::NTuple{N, Integer}) where {T, N} = + MtlArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims)) +MtlArray{T}(::UndefInitializer, dims::Vararg{Integer, N}) where {T, N} = + MtlArray{T, N}(undef, convert(Tuple{Vararg{Int}}, dims)) # empty vector constructor -MtlArray{T,1,S}() where {T,S} = MtlArray{T,1,S}(undef, 0) -MtlArray{T,1}() where {T} = MtlArray{T,1}(undef, 0) +MtlArray{T, 1, S}() where {T, S} = MtlArray{T, 1, S}(undef, 0) +MtlArray{T, 1}() where {T} = MtlArray{T, 1}(undef, 0) -Base.similar(a::MtlArray{T,N,S}; storage=S) where {T,N,S} = - MtlArray{T,N,storage}(undef, size(a)) -Base.similar(::MtlArray{T,<:Any,S}, dims::Base.Dims{N}; storage=S) where {T,N,S} = - MtlArray{T,N,storage}(undef, dims) -Base.similar(::MtlArray{<:Any,<:Any,S}, ::Type{T}, dims::Base.Dims{N}; storage=S) where {T,N,S} = - MtlArray{T,N,storage}(undef, dims) +Base.similar(a::MtlArray{T, N, S}; storage = S) where {T, N, S} = + MtlArray{T, N, storage}(undef, size(a)) +Base.similar(::MtlArray{T, <:Any, S}, dims::Base.Dims{N}; storage = S) where {T, N, S} = + MtlArray{T, N, storage}(undef, dims) +Base.similar(::MtlArray{<:Any, <:Any, S}, ::Type{T}, dims::Base.Dims{N}; storage = S) where {T, N, S} = + MtlArray{T, N, storage}(undef, dims) function Base.copy(a::MtlArray) b = similar(a) - @inbounds copyto!(b, a) + return @inbounds copyto!(b, a) end @@ -246,7 +259,7 @@ Base.elsize(::Type{<:MtlArray{T}}) where {T} = sizeof(T) Base.size(x::MtlArray) = x.dims Base.sizeof(x::MtlArray) = Base.elsize(x) * length(x) -@inline function Base.pointer(x::MtlArray{T}, i::Integer=1; storage=PrivateStorage) where {T} +@inline function Base.pointer(x::MtlArray{T}, 
i::Integer = 1; storage = PrivateStorage) where {T} PT = if storage == PrivateStorage MtlPtr{T} elseif storage == SharedStorage @@ -254,73 +267,73 @@ Base.sizeof(x::MtlArray) = Base.elsize(x) * length(x) else error("unknown memory type") end - Base.unsafe_convert(PT, x) + Base._memory_offset(x, i) + return Base.unsafe_convert(PT, x) + Base._memory_offset(x, i) end function Base.unsafe_convert(::Type{MtlPtr{T}}, x::MtlArray) where {T} buf = x.data[] - MtlPtr{T}(buf, x.offset * Base.elsize(x)) + return MtlPtr{T}(buf, x.offset * Base.elsize(x)) end -function Base.unsafe_convert(::Type{Ptr{S}}, x::MtlArray{T}) where {S,T} +function Base.unsafe_convert(::Type{Ptr{S}}, x::MtlArray{T}) where {S, T} if is_private(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) end synchronize() buf = x.data[] - convert(Ptr{T}, buf) + x.offset * Base.elsize(x) + return convert(Ptr{T}, buf) + x.offset * Base.elsize(x) end ## indexing -function Base.getindex(x::MtlArray{T,N,S}, I::Int) where {T,N,S<:SharedStorage} +function Base.getindex(x::MtlArray{T, N, S}, I::Int) where {T, N, S <: SharedStorage} @boundscheck checkbounds(x, I) - unsafe_load(pointer(x, I; storage=S)) + return unsafe_load(pointer(x, I; storage = S)) end -function Base.setindex!(x::MtlArray{T,N,S}, v, I::Int) where {T,N,S<:SharedStorage} +function Base.setindex!(x::MtlArray{T, N, S}, v, I::Int) where {T, N, S <: SharedStorage} @boundscheck checkbounds(x, I) - unsafe_store!(pointer(x, I; storage=S), v) + return unsafe_store!(pointer(x, I; storage = S), v) end ## interop with other arrays -@inline function MtlArray{T,N}(xs::AbstractArray{T,N}) where {T,N} - A = MtlArray{T,N}(undef, size(xs)) +@inline function MtlArray{T, N}(xs::AbstractArray{T, N}) where {T, N} + A = MtlArray{T, N}(undef, size(xs)) @inline copyto!(A, convert(Array{T}, xs)) return A end -@inline function MtlArray{T,N,S}(xs::AbstractArray{T,N}) where {T,N,S} - A = MtlArray{T,N,S}(undef, size(xs)) +@inline function MtlArray{T, N, S}(xs::AbstractArray{T, N}) where {T, N, S} + A = MtlArray{T, N, S}(undef, size(xs)) @inline copyto!(A, convert(Array{T}, xs)) return A end -MtlArray{T,N}(xs::AbstractArray{OT,N}) where {T,N,OT} = MtlArray{T,N}(map(T, xs)) -MtlArray{T,N,S}(xs::AbstractArray{OT,N}) where {T,N,S,OT} = MtlArray{T,N,S}(map(T, xs)) +MtlArray{T, N}(xs::AbstractArray{OT, N}) where {T, N, OT} = MtlArray{T, N}(map(T, xs)) +MtlArray{T, N, S}(xs::AbstractArray{OT, N}) where {T, N, S, OT} = MtlArray{T, N, S}(map(T, xs)) # underspecified constructors -MtlArray{T}(xs::AbstractArray{OT,N}) where {T,N,OT} = MtlArray{T,N}(xs) -(::Type{MtlArray{T,N} where T})(x::AbstractArray{OT,N}) where {OT,N} = MtlArray{OT,N}(x) -MtlArray(A::AbstractArray{T,N}) where {T,N} = MtlArray{T,N}(A) +MtlArray{T}(xs::AbstractArray{OT, N}) where {T, N, OT} = MtlArray{T, N}(xs) +(::Type{MtlArray{T, N} where {T}})(x::AbstractArray{OT, N}) where {OT, N} = MtlArray{OT, N}(x) +MtlArray(A::AbstractArray{T, N}) where {T, N} = MtlArray{T, N}(A) # copy xs to match Array behavior with same storage mode -MtlArray{T,N,S}(xs::MtlArray{T,N,S}) where {T,N,S} = copy(xs) +MtlArray{T, N, S}(xs::MtlArray{T, N, S}) where {T, N, S} = copy(xs) ## derived types # wrapped arrays: can be used in kernels -const WrappedMtlArray{T,N} = Union{MtlArray{T,N},WrappedArray{T,N,MtlArray,MtlArray{T,N}}} -const WrappedMtlVector{T} = WrappedMtlArray{T,1} -const WrappedMtlMatrix{T} = WrappedMtlArray{T,2} -const WrappedMtlVecOrMat{T} = Union{WrappedMtlVector{T},WrappedMtlMatrix{T}} +const WrappedMtlArray{T, N} = Union{MtlArray{T, N}, 
WrappedArray{T, N, MtlArray, MtlArray{T, N}}} +const WrappedMtlVector{T} = WrappedMtlArray{T, 1} +const WrappedMtlMatrix{T} = WrappedMtlArray{T, 2} +const WrappedMtlVecOrMat{T} = Union{WrappedMtlVector{T}, WrappedMtlMatrix{T}} ## conversions -Base.convert(::Type{T}, x::T) where T <: MtlArray = x +Base.convert(::Type{T}, x::T) where {T <: MtlArray} = x ## interop with C libraries @@ -338,14 +351,16 @@ Base.cconvert(::Type{<:id}, x::MtlArray) = x.data[] ## interop with CPU arrays -Base.collect(x::MtlArray{T,N}) where {T,N} = copyto!(Array{T,N}(undef, size(x)), x) +Base.collect(x::MtlArray{T, N}) where {T, N} = copyto!(Array{T, N}(undef, size(x)), x) ## memory copying # CPU -> GPU -function Base.copyto!(dest::MtlArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, - n::Integer) where T +function Base.copyto!( + dest::MtlArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, + n::Integer + ) where {T} (n == 0 || sizeof(T) == 0) && return dest @boundscheck checkbounds(dest, doffs) @boundscheck checkbounds(dest, doffs + n - 1) @@ -359,8 +374,10 @@ Base.copyto!(dest::MtlArray{T}, src::Array{T}) where {T} = copyto!(dest, 1, src, 1, length(src)) # GPU -> CPU -function Base.copyto!(dest::Array{T}, doffs::Integer, src::MtlArray{T}, soffs::Integer, - n::Integer) where T +function Base.copyto!( + dest::Array{T}, doffs::Integer, src::MtlArray{T}, soffs::Integer, + n::Integer + ) where {T} (n == 0 || sizeof(T) == 0) && return dest @boundscheck checkbounds(dest, doffs) @boundscheck checkbounds(dest, doffs + n - 1) @@ -374,8 +391,10 @@ Base.copyto!(dest::Array{T}, src::MtlArray{T}) where {T} = copyto!(dest, 1, src, 1, length(src)) # GPU -> GPU -function Base.copyto!(dest::MtlArray{T}, doffs::Integer, src::MtlArray{T}, soffs::Integer, - n::Integer) where T +function Base.copyto!( + dest::MtlArray{T}, doffs::Integer, src::MtlArray{T}, soffs::Integer, + n::Integer + ) where {T} (n == 0 || sizeof(T) == 0) && return dest @boundscheck checkbounds(dest, doffs) @boundscheck checkbounds(dest, doffs + n - 1) @@ -394,7 +413,7 @@ Base.copyto!(dest::MtlArray{T}, src::MtlArray{T}) where {T} = copyto!(dest, 1, src, 1, length(src)) # CPU -> GPU -function Base.unsafe_copyto!(dev::MTLDevice, dest::MtlArray{T}, doffs, src::Array{T}, soffs, n) where T +function Base.unsafe_copyto!(dev::MTLDevice, dest::MtlArray{T}, doffs, src::Array{T}, soffs, n) where {T} # these copies are implemented using pure memcpy's, not API calls, so aren't ordered. synchronize() GC.@preserve src dest unsafe_copyto!(dev, pointer(dest, doffs), pointer(src, soffs), n) @@ -404,15 +423,15 @@ function Base.unsafe_copyto!(dev::MTLDevice, dest::MtlArray{T}, doffs, src::Arra end return dest end -function Base.unsafe_copyto!(::MTLDevice, dest::MtlArray{T,<:Any,Metal.SharedStorage}, doffs, src::Array{T}, soffs, n) where T +function Base.unsafe_copyto!(::MTLDevice, dest::MtlArray{T, <:Any, Metal.SharedStorage}, doffs, src::Array{T}, soffs, n) where {T} # these copies are implemented using pure memcpy's, not API calls, so aren't ordered. 
synchronize() - GC.@preserve src dest unsafe_copyto!(pointer(unsafe_wrap(Array,dest), doffs), pointer(src, soffs), n) + GC.@preserve src dest unsafe_copyto!(pointer(unsafe_wrap(Array, dest), doffs), pointer(src, soffs), n) return dest end # GPU -> CPU -function Base.unsafe_copyto!(dev::MTLDevice, dest::Array{T}, doffs, src::MtlArray{T}, soffs, n) where T +function Base.unsafe_copyto!(dev::MTLDevice, dest::Array{T}, doffs, src::MtlArray{T}, soffs, n) where {T} # these copies are implemented using pure memcpy's, not API calls, so aren't ordered. synchronize() GC.@preserve src dest unsafe_copyto!(dev, pointer(dest, doffs), pointer(src, soffs), n) @@ -422,15 +441,15 @@ function Base.unsafe_copyto!(dev::MTLDevice, dest::Array{T}, doffs, src::MtlArra end return dest end -function Base.unsafe_copyto!(::MTLDevice, dest::Array{T}, doffs, src::MtlArray{T,<:Any,Metal.SharedStorage}, soffs, n) where T +function Base.unsafe_copyto!(::MTLDevice, dest::Array{T}, doffs, src::MtlArray{T, <:Any, Metal.SharedStorage}, soffs, n) where {T} # these copies are implemented using pure memcpy's, not API calls, so aren't ordered. synchronize() - GC.@preserve src dest unsafe_copyto!(pointer(dest, doffs), pointer(unsafe_wrap(Array,src), soffs), n) + GC.@preserve src dest unsafe_copyto!(pointer(dest, doffs), pointer(unsafe_wrap(Array, src), soffs), n) return dest end # GPU -> GPU -function Base.unsafe_copyto!(dev::MTLDevice, dest::MtlArray{T}, doffs, src::MtlArray{T}, soffs, n) where T +function Base.unsafe_copyto!(dev::MTLDevice, dest::MtlArray{T}, doffs, src::MtlArray{T}, soffs, n) where {T} # these copies are implemented using pure memcpy's, not API calls, so aren't ordered. synchronize() GC.@preserve src dest unsafe_copyto!(dev, pointer(dest, doffs), pointer(src, soffs), n) @@ -440,10 +459,10 @@ function Base.unsafe_copyto!(dev::MTLDevice, dest::MtlArray{T}, doffs, src::MtlA end return dest end -function Base.unsafe_copyto!(::MTLDevice, dest::MtlArray{T,<:Any,Metal.SharedStorage}, doffs, src::MtlArray{T,<:Any,Metal.SharedStorage}, soffs, n) where T +function Base.unsafe_copyto!(::MTLDevice, dest::MtlArray{T, <:Any, Metal.SharedStorage}, doffs, src::MtlArray{T, <:Any, Metal.SharedStorage}, soffs, n) where {T} # these copies are implemented using pure memcpy's, not API calls, so aren't ordered. synchronize() - GC.@preserve src dest unsafe_copyto!(pointer(unsafe_wrap(Array,dest), doffs), pointer(unsafe_wrap(Array,src), soffs), n) + GC.@preserve src dest unsafe_copyto!(pointer(unsafe_wrap(Array, dest), doffs), pointer(unsafe_wrap(Array, src), soffs), n) return dest end @@ -453,16 +472,16 @@ end # We don't convert isbits types in `adapt`, since they are already # considered GPU-compatible. -Adapt.adapt_storage(::Type{MtlArray}, xs::AT) where {AT<:AbstractArray} = +Adapt.adapt_storage(::Type{MtlArray}, xs::AT) where {AT <: AbstractArray} = isbitstype(AT) ? xs : convert(MtlArray, xs) # if specific type parameters are specified, preserve those -Adapt.adapt_storage(::Type{<:MtlArray{T}}, xs::AT) where {T,AT<:AbstractArray} = +Adapt.adapt_storage(::Type{<:MtlArray{T}}, xs::AT) where {T, AT <: AbstractArray} = isbitstype(AT) ? xs : convert(MtlArray{T}, xs) -Adapt.adapt_storage(::Type{<:MtlArray{T,N}}, xs::AT) where {T,N,AT<:AbstractArray} = - isbitstype(AT) ? xs : convert(MtlArray{T,N}, xs) -Adapt.adapt_storage(::Type{<:MtlArray{T,N,S}}, xs::AT) where {T,N,S,AT<:AbstractArray} = - isbitstype(AT) ? 
xs : convert(MtlArray{T,N,S}, xs) +Adapt.adapt_storage(::Type{<:MtlArray{T, N}}, xs::AT) where {T, N, AT <: AbstractArray} = + isbitstype(AT) ? xs : convert(MtlArray{T, N}, xs) +Adapt.adapt_storage(::Type{<:MtlArray{T, N, S}}, xs::AT) where {T, N, S, AT <: AbstractArray} = + isbitstype(AT) ? xs : convert(MtlArray{T, N, S}, xs) ## opinionated gpu array adaptor @@ -471,14 +490,14 @@ Adapt.adapt_storage(::Type{<:MtlArray{T,N,S}}, xs::AT) where {T,N,S,AT<:Abstract struct MtlArrayAdaptor{S} end -Adapt.adapt_storage(::MtlArrayAdaptor{S}, xs::AbstractArray{T,N}) where {T,N,S} = - isbits(xs) ? xs : MtlArray{T,N,S}(xs) +Adapt.adapt_storage(::MtlArrayAdaptor{S}, xs::AbstractArray{T, N}) where {T, N, S} = + isbits(xs) ? xs : MtlArray{T, N, S}(xs) -Adapt.adapt_storage(::MtlArrayAdaptor{S}, xs::AbstractArray{T,N}) where {T<:Float64,N,S} = - isbits(xs) ? xs : MtlArray{Float32,N,S}(xs) +Adapt.adapt_storage(::MtlArrayAdaptor{S}, xs::AbstractArray{T, N}) where {T <: Float64, N, S} = + isbits(xs) ? xs : MtlArray{Float32, N, S}(xs) -Adapt.adapt_storage(::MtlArrayAdaptor{S}, xs::AbstractArray{T,N}) where {T<:Complex{<:Float64},N,S} = - isbits(xs) ? xs : MtlArray{ComplexF32,N,S}(xs) +Adapt.adapt_storage(::MtlArrayAdaptor{S}, xs::AbstractArray{T, N}) where {T <: Complex{<:Float64}, N, S} = + isbits(xs) ? xs : MtlArray{ComplexF32, N, S}(xs) """ mtl(A; storage=Metal.PrivateStorage) @@ -516,34 +535,34 @@ julia> MtlArray(1:3) 3 ``` """ -@inline mtl(xs; storage=DefaultStorageMode) = adapt(MtlArrayAdaptor{storage}(), xs) +@inline mtl(xs; storage = DefaultStorageMode) = adapt(MtlArrayAdaptor{storage}(), xs) ## utilities for (fname, felt) in ((:zeros, :zero), (:ones, :one)) @eval begin - $fname(::Type{T}, dims::Base.Dims{N}; storage=DefaultStorageMode) where {T,N} = fill!(MtlArray{T,N,storage}(undef, dims), $felt(T)) - $fname(::Type{T}, dims...; storage=DefaultStorageMode) where {T} = fill!(MtlArray{T,length(dims),storage}(undef, dims), $felt(T)) - $fname(dims...; storage=DefaultStorageMode) = fill!(MtlArray{Float32,length(dims),storage}(undef, dims), $felt(Float32)) + $fname(::Type{T}, dims::Base.Dims{N}; storage = DefaultStorageMode) where {T, N} = fill!(MtlArray{T, N, storage}(undef, dims), $felt(T)) + $fname(::Type{T}, dims...; storage = DefaultStorageMode) where {T} = fill!(MtlArray{T, length(dims), storage}(undef, dims), $felt(T)) + $fname(dims...; storage = DefaultStorageMode) = fill!(MtlArray{Float32, length(dims), storage}(undef, dims), $felt(Float32)) end end -fill(v::T, dims::Base.Dims{N}; storage=DefaultStorageMode) where {T,N} = fill!(MtlArray{T,N,storage}(undef, dims), v) -fill(v::T, dims...; storage=DefaultStorageMode) where T = fill!(MtlArray{T,length(dims),storage}(undef, dims), v) +fill(v::T, dims::Base.Dims{N}; storage = DefaultStorageMode) where {T, N} = fill!(MtlArray{T, N, storage}(undef, dims), v) +fill(v::T, dims...; storage = DefaultStorageMode) where {T} = fill!(MtlArray{T, length(dims), storage}(undef, dims), v) # optimized implementation of `fill!` for types that are directly supported by fillbuffer -function Base.fill!(A::MtlArray{T}, val) where T <: Union{UInt8,Int8} +function Base.fill!(A::MtlArray{T}, val) where {T <: Union{UInt8, Int8}} B = convert(T, val) unsafe_fill!(device(A), pointer(A), B, length(A)) - A + return A end ## derived arrays -function GPUArrays.derive(::Type{T}, a::MtlArray{<:Any,<:Any,S}, dims::Dims{N}, offset::Int) where {T,N,S} +function GPUArrays.derive(::Type{T}, a::MtlArray{<:Any, <:Any, S}, dims::Dims{N}, offset::Int) where {T, N, S} offset = (a.offset * 
Base.elsize(a)) ÷ sizeof(T) + offset - MtlArray{T,N,S}(a.data, dims; a.maxsize, offset) + return MtlArray{T, N, S}(a.data, dims; a.maxsize, offset) end @@ -552,13 +571,13 @@ end device(a::SubArray) = device(parent(a)) # pointer conversions -function Base.unsafe_convert(::Type{MTL.MTLBuffer}, V::SubArray{T,N,P,<:Tuple{Vararg{Base.RangeIndex}}}) where {T,N,P} +function Base.unsafe_convert(::Type{MTL.MTLBuffer}, V::SubArray{T, N, P, <:Tuple{Vararg{Base.RangeIndex}}}) where {T, N, P} return Base.unsafe_convert(MTL.MTLBuffer, parent(V)) + - Base._memory_offset(V.parent, map(first, V.indices)...) + Base._memory_offset(V.parent, map(first, V.indices)...) end -function Base.unsafe_convert(::Type{MTL.MTLBuffer}, V::SubArray{T,N,P,<:Tuple{Vararg{Union{Base.RangeIndex,Base.ReshapedUnitRange}}}}) where {T,N,P} +function Base.unsafe_convert(::Type{MTL.MTLBuffer}, V::SubArray{T, N, P, <:Tuple{Vararg{Union{Base.RangeIndex, Base.ReshapedUnitRange}}}}) where {T, N, P} return Base.unsafe_convert(MTL.MTLBuffer, parent(V)) + - (Base.first_index(V) - 1) * sizeof(T) + (Base.first_index(V) - 1) * sizeof(T) end @@ -577,22 +596,24 @@ function Base.unsafe_wrap( arr::MtlArray{T, N}, dims = size(arr); own::Bool = false ) where {T, N} - return unsafe_wrap(Array{T,N}, pointer(arr), dims; own) + return unsafe_wrap(Array{T, N}, pointer(arr), dims; own) end -function Base.unsafe_wrap(t::Type{<:Array{T}}, buf::MTLBuffer, dims; own=false) where T +function Base.unsafe_wrap(t::Type{<:Array{T}}, buf::MTLBuffer, dims; own = false) where {T} ptr = convert(Ptr{T}, buf) return unsafe_wrap(t, ptr, dims; own) end -function Base.unsafe_wrap(t::Type{<:Array{T}}, ptr::MtlPtr{T}, dims; own=false) where T +function Base.unsafe_wrap(t::Type{<:Array{T}}, ptr::MtlPtr{T}, dims; own = false) where {T} return unsafe_wrap(t, convert(Ptr{T}, ptr), dims; own) end -function Base.unsafe_wrap(A::Type{<:MtlArray{T,N}}, arr::Array, dims=size(arr); - dev=device(), kwargs...) where {T,N} +function Base.unsafe_wrap( + A::Type{<:MtlArray{T, N}}, arr::Array, dims = size(arr); + dev = device(), kwargs... + ) where {T, N} GC.@preserve arr begin - buf = MTLBuffer(dev, prod(dims) * sizeof(T), pointer(arr); nocopy=true, kwargs...) + buf = MTLBuffer(dev, prod(dims) * sizeof(T), pointer(arr); nocopy = true, kwargs...) return A(buf, Dims(dims)) end end @@ -606,7 +627,7 @@ Resize `a` to contain `n` elements. If `n` is smaller than the current collectio the first `n` elements will be retained. If `n` is larger, the new elements are not guaranteed to be initialized. """ -function Base.resize!(A::MtlVector{T}, n::Integer) where T +function Base.resize!(A::MtlVector{T}, n::Integer) where {T} # TODO: add additional space to allow for quicker resizing maxsize = n * sizeof(T) bufsize = if isbitstype(T) @@ -618,7 +639,7 @@ function Base.resize!(A::MtlVector{T}, n::Integer) where T # replace the data with a new one. this 'unshares' the array. # as a result, we can safely support resizing unowned buffers. 
- buf = alloc(device(A), bufsize; storage=storagemode(A)) + buf = alloc(device(A), bufsize; storage = storagemode(A)) ptr = MtlPtr{T}(buf) m = min(length(A), n) if m > 0 @@ -634,5 +655,5 @@ function Base.resize!(A::MtlVector{T}, n::Integer) where T A.maxsize = maxsize A.offset = 0 - A + return A end diff --git a/test/array.jl b/test/array.jl index e64dbf9bb..9111f35e1 100644 --- a/test/array.jl +++ b/test/array.jl @@ -2,604 +2,632 @@ STORAGEMODES = [Metal.PrivateStorage, Metal.SharedStorage] @testset "array" begin -let arr = MtlVector{Int}(undef, 1) - @test sizeof(arr) == 8 - @test length(arr) == 1 - @test eltype(arr) == Int -end - -let arr = MtlVector{Int}(undef, 0) - @test sizeof(arr) == 0 -end - -@testset "constructors" begin - xs = MtlArray{Int8}(undef, 2, 3) - @test device(xs) == device() - @test Base.elsize(xs) == sizeof(Int8) - @test xs.data[].length == 6 - xs2 = MtlArray{Int8, 2}(xs) - @test xs2.data[].length == 6 - @test pointer(xs2) != pointer(xs) - - @test collect(MtlArray([1 2; 3 4])) == [1 2; 3 4] - @test collect(mtl([1, 2, 3])) == [1, 2, 3] - @test testf(vec, rand(Float32, 5,3)) - @test mtl(1:3) === 1:3 - - - # Page 22 of https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf - # Only bfloat missing - supported_number_types = [Float16 => Float16, - Float32 => Float32, - Float64 => Float32, - Bool => Bool, - Int16 => Int16, - Int32 => Int32, - Int64 => Int64, - Int8 => Int8, - UInt16 => UInt16, - UInt32 => UInt32, - UInt64 => UInt64, - UInt8 => UInt8] - # Test supported types and ensure only Float64 get converted to Float32 - for (SrcType, TargType) in supported_number_types - @test mtl(SrcType[1]) isa MtlArray{TargType} - @test mtl(Complex{SrcType}[1+1im]) isa MtlArray{Complex{TargType}} - end + @testset "default storage mode" begin + # Test that default storage mode respects UMA detection + if Metal.device().hasUnifiedMemory + @test Metal.DefaultStorageMode == Metal.SharedStorage + else + @test Metal.DefaultStorageMode == Metal.PrivateStorage + end - # test the regular adaptor - @test Adapt.adapt(MtlArray, [1 2;3 4]) isa MtlArray{Int, 2, Metal.DefaultStorageMode} - @test Adapt.adapt(MtlArray{Float32}, [1 2;3 4]) isa MtlArray{Float32, 2, Metal.DefaultStorageMode} - @test Adapt.adapt(MtlArray{Float32, 2}, [1 2;3 4]) isa MtlArray{Float32, 2, Metal.DefaultStorageMode} - @test Adapt.adapt(MtlArray{Float32, 2, Metal.SharedStorage}, [1 2;3 4]) isa MtlArray{Float32, 2, Metal.SharedStorage} - @test Adapt.adapt(MtlMatrix{ComplexF32, Metal.SharedStorage}, [1 2;3 4]) isa MtlArray{ComplexF32, 2, Metal.SharedStorage} - @test Adapt.adapt(MtlArray{Float16}, Float64[1]) isa MtlArray{Float16} - - # Test a few explicitly unsupported types - @test_throws "MtlArray only supports element types that are stored inline" MtlArray(BigInt[1]) - @test_throws "Metal does not support Float64 values" MtlArray(Float64[1]) - @test_throws "Metal does not support Int128 values" MtlArray(Int128[1]) - @test_throws "Metal does not support UInt128 values" MtlArray(UInt128[1]) - - @test collect(Metal.zeros(2, 2)) == zeros(Float32, 2, 2) - @test collect(Metal.ones(2, 2)) == ones(Float32, 2, 2) - - @test collect(Metal.fill(0, 2, 2)) == zeros(Float32, 2, 2) - @test collect(Metal.fill(1, 2, 2)) == ones(Float32, 2, 2) -end + # Test that arrays created without explicit storage use the default + arr = MtlArray{Float32}(undef, 100) + @test Metal.storagemode(arr) == Metal.DefaultStorageMode + + # Test that explicit storage mode overrides the default + arr_private = MtlArray{Float32, 1, 
Metal.PrivateStorage}(undef, 100) + arr_shared = MtlArray{Float32, 1, Metal.SharedStorage}(undef, 100) + @test Metal.storagemode(arr_private) == Metal.PrivateStorage + @test Metal.storagemode(arr_shared) == Metal.SharedStorage + end + + let arr = MtlVector{Int}(undef, 1) + @test sizeof(arr) == 8 + @test length(arr) == 1 + @test eltype(arr) == Int + end + + let arr = MtlVector{Int}(undef, 0) + @test sizeof(arr) == 0 + end + + @testset "constructors" begin + xs = MtlArray{Int8}(undef, 2, 3) + @test device(xs) == device() + @test Base.elsize(xs) == sizeof(Int8) + @test xs.data[].length == 6 + xs2 = MtlArray{Int8, 2}(xs) + @test xs2.data[].length == 6 + @test pointer(xs2) != pointer(xs) + + @test collect(MtlArray([1 2; 3 4])) == [1 2; 3 4] + @test collect(mtl([1, 2, 3])) == [1, 2, 3] + @test testf(vec, rand(Float32, 5, 3)) + @test mtl(1:3) === 1:3 + + + # Page 22 of https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + # Only bfloat missing + supported_number_types = [ + Float16 => Float16, + Float32 => Float32, + Float64 => Float32, + Bool => Bool, + Int16 => Int16, + Int32 => Int32, + Int64 => Int64, + Int8 => Int8, + UInt16 => UInt16, + UInt32 => UInt32, + UInt64 => UInt64, + UInt8 => UInt8, + ] + # Test supported types and ensure only Float64 get converted to Float32 + for (SrcType, TargType) in supported_number_types + @test mtl(SrcType[1]) isa MtlArray{TargType} + @test mtl(Complex{SrcType}[1 + 1im]) isa MtlArray{Complex{TargType}} + end -@testset "copyto!" begin - @testset "$T, $S" for S in [Metal.PrivateStorage, Metal.SharedStorage], - T in [Float16, Float32, Bool, Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8] - dim = (1000,17,10) - A = rand(T,dim) - mtlA = mtl(A;storage=S) - - #cpu -> gpu - res = Metal.zeros(T,dim;storage=S) - copyto!(res,A) - @test Array(res) == Array(A) - - #gpu -> cpu - res = zeros(T,dim) - copyto!(res,mtlA) - @test Array(res) == Array(mtlA) - - #gpu -> gpu - res = Metal.zeros(T,dim;storage=S) - copyto!(res,mtlA) - @test Array(res) == Array(mtlA) + # test the regular adaptor + @test Adapt.adapt(MtlArray, [1 2;3 4]) isa MtlArray{Int, 2, Metal.DefaultStorageMode} + @test Adapt.adapt(MtlArray{Float32}, [1 2;3 4]) isa MtlArray{Float32, 2, Metal.DefaultStorageMode} + @test Adapt.adapt(MtlArray{Float32, 2}, [1 2;3 4]) isa MtlArray{Float32, 2, Metal.DefaultStorageMode} + @test Adapt.adapt(MtlArray{Float32, 2, Metal.SharedStorage}, [1 2;3 4]) isa MtlArray{Float32, 2, Metal.SharedStorage} + @test Adapt.adapt(MtlMatrix{ComplexF32, Metal.SharedStorage}, [1 2;3 4]) isa MtlArray{ComplexF32, 2, Metal.SharedStorage} + @test Adapt.adapt(MtlArray{Float16}, Float64[1]) isa MtlArray{Float16} + + # Test a few explicitly unsupported types + @test_throws "MtlArray only supports element types that are stored inline" MtlArray(BigInt[1]) + @test_throws "Metal does not support Float64 values" MtlArray(Float64[1]) + @test_throws "Metal does not support Int128 values" MtlArray(Int128[1]) + @test_throws "Metal does not support UInt128 values" MtlArray(UInt128[1]) + + @test collect(Metal.zeros(2, 2)) == zeros(Float32, 2, 2) + @test collect(Metal.ones(2, 2)) == ones(Float32, 2, 2) + + @test collect(Metal.fill(0, 2, 2)) == zeros(Float32, 2, 2) + @test collect(Metal.fill(1, 2, 2)) == ones(Float32, 2, 2) + end + + @testset "copyto!" 
begin + @testset "$T, $S" for S in [Metal.PrivateStorage, Metal.SharedStorage], + T in [Float16, Float32, Bool, Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8] + dim = (1000, 17, 10) + A = rand(T, dim) + mtlA = mtl(A; storage = S) + + #cpu -> gpu + res = Metal.zeros(T, dim; storage = S) + copyto!(res, A) + @test Array(res) == Array(A) + + #gpu -> cpu + res = zeros(T, dim) + copyto!(res, mtlA) + @test Array(res) == Array(mtlA) + + #gpu -> gpu + res = Metal.zeros(T, dim; storage = S) + copyto!(res, mtlA) + @test Array(res) == Array(mtlA) + end end -end - -check_storagemode(arr, smode) = Metal.storagemode(arr) == smode - -# There is some repetition to the GPUArrays tests to test for different storagemodes -@testset "$SM storageMode $dim" for SM in STORAGEMODES, dim in [(10,10,10), (1000,17,10)] # The second one purposefully made to always be bigger than 16KiB - N = length(dim) + check_storagemode(arr, smode) = Metal.storagemode(arr) == smode - # mtl - let arr = mtl(rand(2,2); storage= SM) - @test check_storagemode(arr, SM) - end - - # type and dimensionality specified, accepting dims as series of Ints - let arr = MtlArray{Int,3,SM}(undef, dim[1],dim[2],dim[3]) - @test check_storagemode(arr, SM) - end - let arr = MtlArray{Int,2,SM}(undef, dim[1],dim[2]) - @test check_storagemode(arr, SM) - end + # There is some repetition to the GPUArrays tests to test for different storagemodes + @testset "$SM storageMode $dim" for SM in STORAGEMODES, dim in [(10, 10, 10), (1000, 17, 10)] # The second one purposefully made to always be bigger than 16KiB - # empty vector constructor - let arr = MtlArray{Int,1,SM}(undef, 0) - @test check_storagemode(arr, SM) - end - let arr = MtlVector{Int,SM}() - @test check_storagemode(arr, SM) - end + N = length(dim) - ## interop with other arrays - let arr = MtlArray{Float32,N,SM}(rand(Float32,dim)) - @test check_storagemode(arr, SM) - end - let arr = MtlArray{Float32,N,SM}(rand(Int,dim)) - @test check_storagemode(arr, SM) - end + # mtl + let arr = mtl(rand(2, 2); storage = SM) + @test check_storagemode(arr, SM) + end - # constructing new MtlArray from MtlArray - let arr = MtlArray{Int,N,SM}(rand(Int,dim)) - arr2 = MtlArray{Int,N,SM}(arr) - @test check_storagemode(arr2, SM) - end + # type and dimensionality specified, accepting dims as series of Ints + let arr = MtlArray{Int, 3, SM}(undef, dim[1], dim[2], dim[3]) + @test check_storagemode(arr, SM) + end + let arr = MtlArray{Int, 2, SM}(undef, dim[1], dim[2]) + @test check_storagemode(arr, SM) + end - # fill, zeros, ones - let arr = Metal.fill(rand(Float32), dim; storage=SM) - @test check_storagemode(arr, SM) - end + # empty vector constructor + let arr = MtlArray{Int, 1, SM}(undef, 0) + @test check_storagemode(arr, SM) + end + let arr = MtlVector{Int, SM}() + @test check_storagemode(arr, SM) + end - let arr = Metal.zeros(Float32, dim; storage=SM) - @test check_storagemode(arr, SM) - end + ## interop with other arrays + let arr = MtlArray{Float32, N, SM}(rand(Float32, dim)) + @test check_storagemode(arr, SM) + end + let arr = MtlArray{Float32, N, SM}(rand(Int, dim)) + @test check_storagemode(arr, SM) + end - let arr = Metal.ones(Float32, dim; storage=SM) - @test check_storagemode(arr, SM) - end + # constructing new MtlArray from MtlArray + let arr = MtlArray{Int, N, SM}(rand(Int, dim)) + arr2 = MtlArray{Int, N, SM}(arr) + @test check_storagemode(arr2, SM) + end - for SM2 in STORAGEMODES - let arr = MtlArray{Int,N,SM}(rand(Int,dim)) - arr2 = MtlArray{Int,N,SM2}(arr) - @test check_storagemode(arr2, SM2) + # fill, 
zeros, ones + let arr = Metal.fill(rand(Float32), dim; storage = SM) + @test check_storagemode(arr, SM) end - end - # private storage errors. - if SM == Metal.PrivateStorage - let arr_mtl = Metal.zeros(Float32, dim...; storage=Metal.PrivateStorage) - @test is_private(arr_mtl) && !is_shared(arr_mtl) - @test_throws "Cannot access the contents of a private buffer" arr_cpu = unsafe_wrap(Array{Float32}, arr_mtl, dim) + let arr = Metal.zeros(Float32, dim; storage = SM) + @test check_storagemode(arr, SM) end - let b = rand(Float32, 10) - arr_mtl = mtl(b; storage=Metal.PrivateStorage) - @test_throws ErrorException arr_mtl[1] - @test Metal.@allowscalar arr_mtl[1] == b[1] + let arr = Metal.ones(Float32, dim; storage = SM) + @test check_storagemode(arr, SM) end - elseif SM == Metal.SharedStorage - let arr_mtl = Metal.zeros(Float32, dim...; storage=Metal.SharedStorage) - @test !is_private(arr_mtl) && is_shared(arr_mtl) - @test unsafe_wrap(Array{Float32}, arr_mtl) isa Array{Float32} + + for SM2 in STORAGEMODES + let arr = MtlArray{Int, N, SM}(rand(Int, dim)) + arr2 = MtlArray{Int, N, SM2}(arr) + @test check_storagemode(arr2, SM2) + end end - let b = rand(Float32, 10) - arr_mtl = mtl(b; storage=Metal.SharedStorage) - @test arr_mtl[1] == b[1] + # private storage errors. + if SM == Metal.PrivateStorage + let arr_mtl = Metal.zeros(Float32, dim...; storage = Metal.PrivateStorage) + @test is_private(arr_mtl) && !is_shared(arr_mtl) + @test_throws "Cannot access the contents of a private buffer" arr_cpu = unsafe_wrap(Array{Float32}, arr_mtl, dim) + end + + let b = rand(Float32, 10) + arr_mtl = mtl(b; storage = Metal.PrivateStorage) + @test_throws ErrorException arr_mtl[1] + @test Metal.@allowscalar arr_mtl[1] == b[1] + end + elseif SM == Metal.SharedStorage + let arr_mtl = Metal.zeros(Float32, dim...; storage = Metal.SharedStorage) + @test !is_private(arr_mtl) && is_shared(arr_mtl) + @test unsafe_wrap(Array{Float32}, arr_mtl) isa Array{Float32} + end + + let b = rand(Float32, 10) + arr_mtl = mtl(b; storage = Metal.SharedStorage) + @test arr_mtl[1] == b[1] + end end end -end -# Also tests changing storagemode -@testset "similar" begin - check_similar(::MtlArray{T,N,S}, typ, dim, sm) where {T,N,S} = - T == typ && N == dim && S == sm - # similar - typ1 = Int - typ2 = Float32 - dim1 = (10,10,10) - n1 = length(dim1) - dim2 = dim1[1:2] - n2 = length(dim2) - sm1 = Metal.SharedStorage - sm2 = Metal.PrivateStorage + # Also tests changing storagemode + @testset "similar" begin + check_similar(::MtlArray{T, N, S}, typ, dim, sm) where {T, N, S} = + T == typ && N == dim && S == sm + # similar + typ1 = Int + typ2 = Float32 + dim1 = (10, 10, 10) + n1 = length(dim1) + dim2 = dim1[1:2] + n2 = length(dim2) + sm1 = Metal.SharedStorage + sm2 = Metal.PrivateStorage - arr = MtlArray{typ1, n1, sm1}(undef, dim1) + arr = MtlArray{typ1, n1, sm1}(undef, dim1) - s1 = similar(arr) - @test check_similar(s1,typ1,n1,sm1) + s1 = similar(arr) + @test check_similar(s1, typ1, n1, sm1) - s2 = similar(arr, dim2) - @test check_similar(s2,typ1,n2,sm1) + s2 = similar(arr, dim2) + @test check_similar(s2, typ1, n2, sm1) - s3 = similar(arr, typ2, dim2) - @test check_similar(s3,typ2,n2,sm1) + s3 = similar(arr, typ2, dim2) + @test check_similar(s3, typ2, n2, sm1) - # s4-s6 test for changing storagemode - s4 = similar(arr; storage=sm2) - @test check_similar(s4,typ1,n1,sm2) + # s4-s6 test for changing storagemode + s4 = similar(arr; storage = sm2) + @test check_similar(s4, typ1, n1, sm2) - s5 = similar(arr, dim2; storage=sm2) - @test 
check_similar(s5,typ1,n2,sm2) + s5 = similar(arr, dim2; storage = sm2) + @test check_similar(s5, typ1, n2, sm2) - s6 = similar(arr, typ2, dim2; storage=sm2) - @test check_similar(s6,typ2,n2,sm2) + s6 = similar(arr, typ2, dim2; storage = sm2) + @test check_similar(s6, typ2, n2, sm2) -end + end -@testset "fill($T)" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, - Float16, Float32] - b = rand(T) + @testset "fill($T)" for T in [ + Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, + Float16, Float32, + ] + b = rand(T) - # Dims in tuple - let A = Metal.fill(b, (10, 10, 10, 1000)) - B = fill(b, (10, 10, 10, 1000)) - @test Array(A) == B - end + # Dims in tuple + let A = Metal.fill(b, (10, 10, 10, 1000)) + B = fill(b, (10, 10, 10, 1000)) + @test Array(A) == B + end - let M = Metal.fill(b, (10, 10)) - B = fill(b, (10, 10)) - @test Array(M) == B - end + let M = Metal.fill(b, (10, 10)) + B = fill(b, (10, 10)) + @test Array(M) == B + end - let V = Metal.fill(b, (10,)) - B = fill(b, (10,)) - @test Array(V) == B - end + let V = Metal.fill(b, (10,)) + B = fill(b, (10,)) + @test Array(V) == B + end - #Dims already unpacked - let A = Metal.fill(b, 10, 1000, 1000) - B = fill(b, 10, 1000, 1000) - @test Array(A) == B - end + #Dims already unpacked + let A = Metal.fill(b, 10, 1000, 1000) + B = fill(b, 10, 1000, 1000) + @test Array(A) == B + end - let M = Metal.fill(b, 10, 10) - B = fill(b, 10, 10) - @test Array(M) == B - end + let M = Metal.fill(b, 10, 10) + B = fill(b, 10, 10) + @test Array(M) == B + end - let V = Metal.fill(b, 10) - B = fill(b, 10) - @test Array(V) == B + let V = Metal.fill(b, 10) + B = fill(b, 10) + @test Array(V) == B + end end -end -@testset "fill!($T)" for T in [Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, - Float16, Float32] - b = rand(T) + @testset "fill!($T)" for T in [ + Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, + Float16, Float32, + ] + b = rand(T) - # Dims in tuple - let A = MtlArray{T,3}(undef, (10, 1000, 1000)) - fill!(A, b) - @test all(Array(A) .== b) - end + # Dims in tuple + let A = MtlArray{T, 3}(undef, (10, 1000, 1000)) + fill!(A, b) + @test all(Array(A) .== b) + end - let M = MtlMatrix{T}(undef, (10, 10)) - fill!(M, b) - @test all(Array(M) .== b) - end + let M = MtlMatrix{T}(undef, (10, 10)) + fill!(M, b) + @test all(Array(M) .== b) + end - let V = MtlVector{T}(undef, (10,)) - fill!(V, b) - @test all(Array(V) .== b) - end + let V = MtlVector{T}(undef, (10,)) + fill!(V, b) + @test all(Array(V) .== b) + end - # Dims already unpacked - let A = MtlArray{T,4}(undef, 10, 10, 10, 1000) - fill!(A, b) - @test all(Array(A) .== b) - end + # Dims already unpacked + let A = MtlArray{T, 4}(undef, 10, 10, 10, 1000) + fill!(A, b) + @test all(Array(A) .== b) + end - let M = MtlMatrix{T}(undef, 10, 10) - fill!(M, b) - @test all(Array(M) .== b) - end + let M = MtlMatrix{T}(undef, 10, 10) + fill!(M, b) + @test all(Array(M) .== b) + end - let V = MtlVector{T}(undef, 10) - fill!(V, b) - @test all(Array(V) .== b) - end + let V = MtlVector{T}(undef, 10) + fill!(V, b) + @test all(Array(V) .== b) + end - # 0-length array - let A = MtlArray{T}(undef, 0) - b = rand(T) - fill!(A, b) - @test A isa MtlArray{T,1} - @test Array(A) == fill(b, 0) + # 0-length array + let A = MtlArray{T}(undef, 0) + b = rand(T) + fill!(A, b) + @test A isa MtlArray{T, 1} + @test Array(A) == fill(b, 0) + end end -end -# https://github.com/JuliaGPU/CUDA.jl/issues/2191 -@testset "preserving storage mode" begin - a = mtl([1]; storage=Metal.SharedStorage) - @test 
Metal.storagemode(a) == Metal.SharedStorage + # https://github.com/JuliaGPU/CUDA.jl/issues/2191 + @testset "preserving storage mode" begin + a = mtl([1]; storage = Metal.SharedStorage) + @test Metal.storagemode(a) == Metal.SharedStorage - # storage mode should be preserved - b = a .+ 1 - @test Metal.storagemode(b) == Metal.SharedStorage + # storage mode should be preserved + b = a .+ 1 + @test Metal.storagemode(b) == Metal.SharedStorage - # when there's a conflict, we should defer to shared memory - c = mtl([1]; storage=Metal.PrivateStorage) - d = mtl([1]; storage=Metal.SharedStorage) - e = c .+ d - @test Metal.storagemode(e) == Metal.SharedStorage -end + # when there's a conflict, we should defer to shared memory + c = mtl([1]; storage = Metal.PrivateStorage) + d = mtl([1]; storage = Metal.SharedStorage) + e = c .+ d + @test Metal.storagemode(e) == Metal.SharedStorage + end -@testset "resizing" begin - a = MtlArray([1,2,3]) + @testset "resizing" begin + a = MtlArray([1, 2, 3]) - resize!(a, 3) - @test length(a) == 3 - @test Array(a) == [1,2,3] + resize!(a, 3) + @test length(a) == 3 + @test Array(a) == [1, 2, 3] - resize!(a, 5) - @test length(a) == 5 - @test Array(a)[1:3] == [1,2,3] + resize!(a, 5) + @test length(a) == 5 + @test Array(a)[1:3] == [1, 2, 3] - resize!(a, 2) - @test length(a) == 2 - @test Array(a)[1:2] == [1,2] + resize!(a, 2) + @test length(a) == 2 + @test Array(a)[1:2] == [1, 2] - b = MtlArray{Int}(undef, 0) - @test length(b) == 0 - resize!(b, 1) - @test length(b) == 1 -end + b = MtlArray{Int}(undef, 0) + @test length(b) == 0 + resize!(b, 1) + @test length(b) == 1 + end -function _alignedvec(::Type{T}, n::Integer, alignment::Integer = 16384) where {T} - ispow2(alignment) || throw(ArgumentError("$alignment is not a power of 2")) - alignment ≥ sizeof(Int) || throw(ArgumentError("$alignment is not a multiple of $(sizeof(Int))")) - isbitstype(T) || throw(ArgumentError("$T is not a bitstype")) - p = Ref{Ptr{T}}() - err = ccall(:posix_memalign, Cint, (Ref{Ptr{T}}, Csize_t, Csize_t), p, alignment, n * sizeof(T)) - iszero(err) || throw(OutOfMemoryError()) - return unsafe_wrap(Array, p[], n, own = true) -end + function _alignedvec(::Type{T}, n::Integer, alignment::Integer = 16384) where {T} + ispow2(alignment) || throw(ArgumentError("$alignment is not a power of 2")) + alignment ≥ sizeof(Int) || throw(ArgumentError("$alignment is not a multiple of $(sizeof(Int))")) + isbitstype(T) || throw(ArgumentError("$T is not a bitstype")) + p = Ref{Ptr{T}}() + err = ccall(:posix_memalign, Cint, (Ref{Ptr{T}}, Csize_t, Csize_t), p, alignment, n * sizeof(T)) + iszero(err) || throw(OutOfMemoryError()) + return unsafe_wrap(Array, p[], n, own = true) + end -@testset "unsafe_wrap" begin - @testset "cpu array incremented" begin - @testset "wrap cpu" begin - @testset "check cpu" begin # cpu array checked first - arr = _alignedvec(Float32, 16384 * 2) - fill!(arr, one(eltype(arr))) - marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) + @testset "unsafe_wrap" begin + @testset "cpu array incremented" begin + @testset "wrap cpu" begin + @testset "check cpu" begin # cpu array checked first + arr = _alignedvec(Float32, 16384 * 2) + fill!(arr, one(eltype(arr))) + marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) - @test all(arr .== 1) - @test all(marr .== 1) + @test all(arr .== 1) + @test all(marr .== 1) - arr .+= 1 - @test all(arr .== 2) - @test all(marr .== 2) - end + arr .+= 1 + @test all(arr .== 2) + @test all(marr .== 2) + end - @testset "check gpu" begin # gpu array checked first - arr = 
_alignedvec(Float32, 16384 * 2) - fill!(arr, one(eltype(arr))) - marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) + @testset "check gpu" begin # gpu array checked first + arr = _alignedvec(Float32, 16384 * 2) + fill!(arr, one(eltype(arr))) + marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) - @test all(marr .== 1) - @test all(arr .== 1) + @test all(marr .== 1) + @test all(arr .== 1) - arr .+= 1 - @test all(marr .== 2) - @test all(arr .== 2) + arr .+= 1 + @test all(marr .== 2) + @test all(arr .== 2) + end end - end - @testset "wrap gpu" begin - @testset "check cpu" begin # cpu array checked first - marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) - arr = unsafe_wrap(Vector{Float32}, marr) + @testset "wrap gpu" begin + @testset "check cpu" begin # cpu array checked first + marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) + arr = unsafe_wrap(Vector{Float32}, marr) - @test all(arr .== 1) - @test all(marr .== 1) + @test all(arr .== 1) + @test all(marr .== 1) - arr .+= 1 - @test all(arr .== 2) - @test all(marr .== 2) - end + arr .+= 1 + @test all(arr .== 2) + @test all(marr .== 2) + end - @testset "check gpu" begin # gpu array checked first - marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) - arr = unsafe_wrap(Vector{Float32}, marr) + @testset "check gpu" begin # gpu array checked first + marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) + arr = unsafe_wrap(Vector{Float32}, marr) - @test all(marr .== 1) - @test all(arr .== 1) + @test all(marr .== 1) + @test all(arr .== 1) - arr .+= 1 - @test all(marr .== 2) - @test all(arr .== 2) + arr .+= 1 + @test all(marr .== 2) + @test all(arr .== 2) + end end end - end - @testset "gpu array incremented" begin - @testset "wrap cpu" begin - @testset "check cpu" begin # cpu array checked first - arr = _alignedvec(Float32, 16384 * 2) - fill!(arr, one(eltype(arr))) - marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) - - @test all(arr .== 1) - @test all(marr .== 1) - - Metal.@sync marr .+= 1 - @test all(arr .== 2) - @test all(marr .== 2) + @testset "gpu array incremented" begin + @testset "wrap cpu" begin + @testset "check cpu" begin # cpu array checked first + arr = _alignedvec(Float32, 16384 * 2) + fill!(arr, one(eltype(arr))) + marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) + + @test all(arr .== 1) + @test all(marr .== 1) + + Metal.@sync marr .+= 1 + @test all(arr .== 2) + @test all(marr .== 2) + end + + @testset "check gpu" begin # gpu array checked first + arr = _alignedvec(Float32, 16384 * 2) + fill!(arr, one(eltype(arr))) + marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) + + @test all(marr .== 1) + @test all(arr .== 1) + + marr .+= 1 + @test all(marr .== 2) + @test all(arr .== 2) + end end - @testset "check gpu" begin # gpu array checked first - arr = _alignedvec(Float32, 16384 * 2) - fill!(arr, one(eltype(arr))) - marr = Metal.@sync unsafe_wrap(MtlVector{Float32}, arr) + @testset "wrap gpu" begin + @testset "check cpu" begin # cpu array checked first + marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) + arr = unsafe_wrap(Vector{Float32}, marr) - @test all(marr .== 1) - @test all(arr .== 1) + @test all(arr .== 1) + @test all(marr .== 1) - marr .+= 1 - @test all(marr .== 2) - @test all(arr .== 2) - end - end + Metal.@sync marr .+= 1 + @test all(arr .== 2) + @test all(marr .== 2) + end - @testset "wrap gpu" begin - @testset "check cpu" begin # cpu array checked first - marr = Metal.@sync 
Metal.ones(Float32, 18000; storage = Metal.SharedStorage) - arr = unsafe_wrap(Vector{Float32}, marr) + @testset "check gpu" begin # gpu array checked first + marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) + arr = unsafe_wrap(Vector{Float32}, marr) - @test all(arr .== 1) - @test all(marr .== 1) + @test all(marr .== 1) + @test all(arr .== 1) - Metal.@sync marr .+= 1 - @test all(arr .== 2) - @test all(marr .== 2) + marr .+= 1 + @test all(marr .== 2) + @test all(arr .== 2) + end end + end - @testset "check gpu" begin # gpu array checked first - marr = Metal.@sync Metal.ones(Float32, 18000; storage = Metal.SharedStorage) - arr = unsafe_wrap(Vector{Float32}, marr) - - @test all(marr .== 1) - @test all(arr .== 1) + @testset "Issue #451" begin + a = mtl(reshape(Float32.(1:60), 5, 4, 3); storage = Metal.SharedStorage) + view_a = @view a[:, 1:4, 2] + b = copy(unsafe_wrap(Array, view_a)) + c = Array(view_a) - marr .+= 1 - @test all(marr .== 2) - @test all(arr .== 2) - end + @test b == c end - end - @testset "Issue #451" begin - a = mtl(reshape(Float32.(1:60), 5,4,3);storage=Metal.SharedStorage) - view_a = @view a[:,1:4,2] - b = copy(unsafe_wrap(Array, view_a)) - c = Array(view_a) + # test that you cannot create an array with a different eltype + marr3 = mtl(zeros(Float32, 10); storage = Metal.SharedStorage) + @test_throws MethodError unsafe_wrap(Array{Float16}, marr3) + end - @test b == c + @testset "ReshapedArray" begin + @test Array(sum(reshape(Metal.ones(3, 10)', (5, 3, 2)); dims = 1)) == fill(5, (1, 3, 2)) + @test Array(sum(reshape(PermutedDimsArray(reshape(mtl(collect(Float32, 1:30)), 5, 3, 2), (3, 1, 2)), (10, 3)); dims = 1)) == + sum(reshape(PermutedDimsArray(reshape(Float32.(1:30), 5, 3, 2), (3, 1, 2)), (10, 3)); dims = 1) end - # test that you cannot create an array with a different eltype - marr3 = mtl(zeros(Float32, 10); storage = Metal.SharedStorage) - @test_throws MethodError unsafe_wrap(Array{Float16}, marr3) -end + @testset "accumulate" begin + for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384 + 1) # small, large, odd & even, pow2 and not + @test testf(x -> accumulate(+, x), rand(Float32, n)) + @test testf(x -> accumulate(+, x), rand(Float32, n, 2)) + @test testf(Base.Fix2((x, y) -> accumulate(+, x; init = y), rand(Float32)), rand(Float32, n)) + end -@testset "ReshapedArray" begin - @test Array(sum(reshape(Metal.ones(3, 10)', (5, 3, 2)); dims=1)) == fill(5, (1,3,2)) - @test Array(sum(reshape(PermutedDimsArray(reshape(mtl(collect(Float32, 1:30)), 5, 3, 2), (3, 1, 2)), (10, 3)); dims=1)) == - sum(reshape(PermutedDimsArray(reshape(Float32.(1:30), 5, 3, 2), (3, 1, 2)), (10, 3)); dims=1) -end + # multidimensional + for (sizes, dims) in ( + (2,) => 2, + (3, 4, 5) => 2, + (1, 70, 50, 20) => 3, + ) + @test testf(x -> accumulate(+, x; dims = dims), rand(-10:10, sizes)) + @test testf(x -> accumulate(+, x), rand(-10:10, sizes)) + end -@testset "accumulate" begin - for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not - @test testf(x->accumulate(+, x), rand(Float32, n)) - @test testf(x->accumulate(+, x), rand(Float32, n, 2)) - @test testf(Base.Fix2((x,y)->accumulate(+, x; init=y), rand(Float32)), rand(Float32, n)) - end + # using initializer + for (sizes, dims) in ( + (2,) => 2, + (3, 4, 5) => 2, + (1, 70, 50, 20) => 3, + ) + @test testf(Base.Fix2((x, y) -> accumulate(+, x; dims = dims, init = y), rand(-10:10)), rand(-10:10, sizes)) + @test testf(Base.Fix2((x, y) -> accumulate(+, x; init = y), rand(-10:10)), rand(-10:10, sizes)) + end 
- # multidimensional - for (sizes, dims) in ((2,) => 2, - (3,4,5) => 2, - (1, 70, 50, 20) => 3,) - @test testf(x->accumulate(+, x; dims=dims), rand(-10:10, sizes)) - @test testf(x->accumulate(+, x), rand(-10:10, sizes)) - end + # in place + @test testf(x -> (accumulate!(+, x, copy(x)); x), rand(Float32, 2)) - # using initializer - for (sizes, dims) in ((2,) => 2, - (3,4,5) => 2, - (1, 70, 50, 20) => 3) - @test testf(Base.Fix2((x,y)->accumulate(+, x; dims=dims, init=y), rand(-10:10)), rand(-10:10, sizes)) - @test testf(Base.Fix2((x,y)->accumulate(+, x; init=y), rand(-10:10)), rand(-10:10, sizes)) + # specialized + @test testf(cumsum, rand(Float32, 2)) + @test testf(cumprod, rand(Float32, 2)) end - # in place - @test testf(x->(accumulate!(+, x, copy(x)); x), rand(Float32, 2)) - - # specialized - @test testf(cumsum, rand(Float32, 2)) - @test testf(cumprod, rand(Float32, 2)) -end - -@testset "findall" begin - # 1D - @test testf(x->findall(x), rand(Bool, 1000)) - @test testf(x->findall(y->y>Float32(0.5), x), rand(Float32,1000)) + @testset "findall" begin + # 1D + @test testf(x -> findall(x), rand(Bool, 1000)) + @test testf(x -> findall(y -> y > Float32(0.5), x), rand(Float32, 1000)) - # Set storage mode to a different one than the default - let storage=Metal.DefaultStorageMode == Metal.PrivateStorage ? Metal.SharedStorage : Metal.PrivateStorage - x = mtl(rand(Float32,100); storage) - out = findall(y->y>Float32(0.5), x) - @test Metal.storagemode(x) == Metal.storagemode(out) - end + # Set storage mode to a different one than the default + let storage = Metal.DefaultStorageMode == Metal.PrivateStorage ? Metal.SharedStorage : Metal.PrivateStorage + x = mtl(rand(Float32, 100); storage) + out = findall(y -> y > Float32(0.5), x) + @test Metal.storagemode(x) == Metal.storagemode(out) + end - # ND - let x = rand(Bool, 0, 0) - @test findall(x) == Array(findall(MtlArray(x))) - end - let x = rand(Bool, 1000, 1000) - @test findall(x) == Array(findall(MtlArray(x))) - end - let x = rand(Float32, 1000, 1000) - @test findall(y->y>Float32(0.5), x) == Array(findall(y->y>Float32(0.5), MtlArray(x))) - end + # ND + let x = rand(Bool, 0, 0) + @test findall(x) == Array(findall(MtlArray(x))) + end + let x = rand(Bool, 1000, 1000) + @test findall(x) == Array(findall(MtlArray(x))) + end + let x = rand(Float32, 1000, 1000) + @test findall(y -> y > Float32(0.5), x) == Array(findall(y -> y > Float32(0.5), MtlArray(x))) + end - # ambiguity - let f = in(3) - x = MtlArray([1, 2, 3, 4, 5, 3]) - @test Array(findall(f, x)) == [3, 6] + # ambiguity + let f = in(3) + x = MtlArray([1, 2, 3, 4, 5, 3]) + @test Array(findall(f, x)) == [3, 6] + end end -end -@testset "broadcast" begin - testf(f, x) = Array(f(MtlArray(x))) ≈ f(x) + @testset "broadcast" begin + testf(f, x) = Array(f(MtlArray(x))) ≈ f(x) - @test testf(x->max.(x, zero(Float32)), randn(Float32, 1000)) - @test testf(x->min.(x, one(Float32)), randn(Float32, 1000)) - @test testf(x->min.(max.(x, zero(Float32)), one(Float32)), randn(Float32, 1000)) - @test testf(x->max.(min.(x, one(Float32)), zero(Float32)), randn(Float32, 1000)) + @test testf(x -> max.(x, zero(Float32)), randn(Float32, 1000)) + @test testf(x -> min.(x, one(Float32)), randn(Float32, 1000)) + @test testf(x -> min.(max.(x, zero(Float32)), one(Float32)), randn(Float32, 1000)) + @test testf(x -> max.(min.(x, one(Float32)), zero(Float32)), randn(Float32, 1000)) - # preserving buffer types - let x = Metal.zeros(Float32, 1; storage=Metal.SharedStorage) - y = x .+ 1 - @test is_shared(y) - end + # preserving buffer types + 
let x = Metal.zeros(Float32, 1; storage = Metal.SharedStorage) + y = x .+ 1 + @test is_shared(y) + end - # when storages are different, choose shared - let x = Metal.zeros(Float32, 1; storage=Metal.SharedStorage), y = Metal.zeros(Float32, 1; storage=Metal.PrivateStorage) - z = x .+ y - @test is_shared(z) - end + # when storages are different, choose shared + let x = Metal.zeros(Float32, 1; storage = Metal.SharedStorage), y = Metal.zeros(Float32, 1; storage = Metal.PrivateStorage) + z = x .+ y + @test is_shared(z) + end - let x = Metal.zeros(Float32, 2, 2; storage=Metal.SharedStorage), y = Metal.zeros(Float32, 2; storage=Metal.PrivateStorage) - z = x .+ y - @test is_shared(z) + let x = Metal.zeros(Float32, 2, 2; storage = Metal.SharedStorage), y = Metal.zeros(Float32, 2; storage = Metal.PrivateStorage) + z = x .+ y + @test is_shared(z) + end end -end end @testset "large map reduce" begin - dev = device() + dev = device() - big_size = Metal.serial_mapreduce_threshold(dev) + 5 - a = rand(Float32, big_size, 31) - c = MtlArray(a) + big_size = Metal.serial_mapreduce_threshold(dev) + 5 + a = rand(Float32, big_size, 31) + c = MtlArray(a) - expected = minimum(a, dims=2) - actual = minimum(c, dims=2) - @test expected == Array(actual) + expected = minimum(a, dims = 2) + actual = minimum(c, dims = 2) + @test expected == Array(actual) - expected = findmax(a, dims=2) - actual = findmax(c, dims=2) - @test expected == map(Array, actual) + expected = findmax(a, dims = 2) + actual = findmax(c, dims = 2) + @test expected == map(Array, actual) - expected = sum(a, dims=2) - actual = sum(c, dims=2) - @test expected == Array(actual) + expected = sum(a, dims = 2) + actual = sum(c, dims = 2) + @test expected == Array(actual) - a = rand(Int, big_size, 31) - c = MtlArray(a) + a = rand(Int, big_size, 31) + c = MtlArray(a) - expected = minimum(a, dims=2) - actual = minimum(c, dims=2) - @test expected == Array(actual) + expected = minimum(a, dims = 2) + actual = minimum(c, dims = 2) + @test expected == Array(actual) - expected = findmax(a, dims=2) - actual = findmax(c, dims=2) - @test expected == map(Array, actual) + expected = findmax(a, dims = 2) + actual = findmax(c, dims = 2) + @test expected == map(Array, actual) - expected = sum(a, dims=2) - actual = sum(c, dims=2) - @test expected == Array(actual) + expected = sum(a, dims = 2) + actual = sum(c, dims = 2) + @test expected == Array(actual) end -
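
A minimal usage sketch of the "auto" default storage mode introduced above, assuming an Apple Silicon (unified-memory) device and the "default_storage" preference left at its new "auto" default; the commented `set_preferences!` call is an assumption about how the `@load_preference`-backed setting would be overridden, not something this patch itself adds.

using Metal

# With "default_storage" == "auto", DefaultStorageMode is derived from the device:
# SharedStorage on unified-memory (Apple Silicon) GPUs, PrivateStorage otherwise.
@show Metal.DefaultStorageMode

# Arrays constructed without an explicit storage parameter pick up the default ...
a = MtlArray{Float32}(undef, 16)
@assert Metal.storagemode(a) == Metal.DefaultStorageMode

# ... while an explicit storage parameter (or the `storage` keyword of `mtl`) overrides it.
b = MtlArray{Float32, 1, Metal.PrivateStorage}(undef, 16)
c = mtl(rand(Float32, 16); storage = Metal.SharedStorage)
@assert Metal.storagemode(b) == Metal.PrivateStorage
@assert Metal.storagemode(c) == Metal.SharedStorage

# To force the pre-"auto" behaviour, the preference can be pinned back to "private"
# (assumed invocation via Preferences.jl; takes effect after restarting Julia):
# using Preferences; set_preferences!(Metal, "default_storage" => "private")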