diff --git a/src/plan.jl b/src/plan.jl
index efa8333..8a72045 100644
--- a/src/plan.jl
+++ b/src/plan.jl
@@ -145,10 +145,17 @@ function LinearAlgebra.mul!(y::AbstractArray{U,N}, p::FFTAPlan_cx{T,1}, x::Abstr
     end
     Rpre = CartesianIndices(size(x)[1:p.region[]-1])
     Rpost = CartesianIndices(size(x)[p.region[]+1:end])
-    for Ipre in Rpre
-        for Ipost in Rpost
-            @views fft!(y[Ipre,:,Ipost], x[Ipre,:,Ipost], 1, 1, p.dir, p.callgraph[1][1].type, p.callgraph[1], 1)
-        end
+    _mul_loop!(y, x, Rpre, Rpost, p)
+end
+
+function _mul_loop!(
+    y::AbstractArray{U,N},
+    x::AbstractArray{T,N},
+    Rpre::CartesianIndices,
+    Rpost::CartesianIndices,
+    p::FFTAPlan_cx{T,1}) where {T,U,N}
+    for Ipost in Rpost, Ipre in Rpre
+        @views fft!(y[Ipre,:,Ipost], x[Ipre,:,Ipost], 1, 1, p.dir, p.callgraph[1][1].type, p.callgraph[1], 1)
     end
 end
 
@@ -172,10 +179,10 @@ function LinearAlgebra.mul!(y::AbstractArray{U,N}, p::FFTAPlan_cx{T,2}, x::Abstr
     # Introduce function barrier here since the variables used in the loop ranges aren't inferred. This
     # is partly because the region field of the plan is abstractly typed but even if that wasn't the case,
     # it might be a bit tricky to construct the Rxs in an inferred way.
-    _mul_loop(y_tmp, y, x, p, R1, R2, R3, rows, cols)
+    _mul_loop!(y_tmp, y, x, p, R1, R2, R3, rows, cols)
 end
 
-function _mul_loop(
+function _mul_loop!(
     y_tmp::AbstractArray,
     y::AbstractArray,
     x::AbstractArray,
@@ -186,17 +193,13 @@ function _mul_loop(
     rows::Int,
     cols::Int
 )
-    for I1 in R1
-        for I2 in R2
-            for I3 in R3
-                for k in 1:cols
-                    @views fft!(y_tmp[:,k],  x[I1,:,I2,k,I3], 1, 1, p.dir, p.callgraph[1][1].type, p.callgraph[1], 1)
-                end
+    for I3 in R3, I2 in R2, I1 in R1
+        for k in 1:cols
+            @views fft!(y_tmp[:,k],  x[I1,:,I2,k,I3], 1, 1, p.dir, p.callgraph[1][1].type, p.callgraph[1], 1)
+        end
 
-                for k in 1:rows
-                    @views fft!(y[I1,k,I2,:,I3], y_tmp[k,:], 1, 1, p.dir, p.callgraph[2][1].type, p.callgraph[2], 1)
-                end
-            end
+        for k in 1:rows
+            @views fft!(y[I1,k,I2,:,I3], y_tmp[k,:], 1, 1, p.dir, p.callgraph[2][1].type, p.callgraph[2], 1)
         end
     end
 end