diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml
index 6fc7902..80bff9c 100644
--- a/.github/workflows/Documentation.yml
+++ b/.github/workflows/Documentation.yml
@@ -17,20 +17,11 @@ jobs:
         os: [ubuntu-latest]
     steps:
       - uses: actions/checkout@v4
-      - uses: julia-actions/setup-julia@latest
+      - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.julia-version }}
       - name: Cache artifacts
-        uses: actions/cache@v4
-        env:
-          cache-name: cache-artifacts
-        with:
-          path: ~/.julia/artifacts 
-          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
-          restore-keys: |
-            ${{ runner.os }}-test-${{ env.cache-name }}-
-            ${{ runner.os }}-test-
-            ${{ runner.os }}-
+        uses: julia-actions/cache@v2
       - name: Install dependencies
         run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
       - name: Build and deploy
diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
index cff571a..8a450da 100644
--- a/.github/workflows/UnitTest.yml
+++ b/.github/workflows/UnitTest.yml
@@ -44,7 +44,9 @@ jobs:
         uses: julia-actions/cache@v2
 
       - name: "Unit Test"
-        uses: julia-actions/julia-runtest@master
+        uses: julia-actions/julia-runtest@v1
+        env:
+          JULIA_NUM_THREADS: 2
 
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v5
diff --git a/Project.toml b/Project.toml
index 1e5f3e4..0caee1b 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+RealFFTs = "3645faec-0534-4e91-afef-021db0981eec"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -31,10 +32,11 @@ ImageCore = "0.10"
 OffsetArrays = "1.9"
 PrecompileTools = "1"
 Reexport = "1.1"
+RealFFTs = "1"
 StaticArrays = "0.10, 0.11, 0.12, 1.0"
 Statistics = "1"
 TiledIteration = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.10"
 
 [extras]
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
diff --git a/docs/src/index.md b/docs/src/index.md
index fd57e3f..7f168f1 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -13,7 +13,7 @@ processing.
 
 The main functions provided by this package are:
 
-| Function                 | Action         | 
+| Function                 | Action         |
 |:-------------------------|:---------------|
 |[`imfilter`](@ref)        | Filter a one, two or multidimensional array img with a kernel by computing their correlation |
 |[`imfilter!`](@ref)       | Filter an array img with kernel kernel by computing their correlation, storing the result in imgfilt |
@@ -25,7 +25,7 @@ The main functions provided by this package are:
 |[`findlocalminima`](@ref) | Returns the coordinates of elements whose value is smaller than all of their immediate neighbors |
 |[`findlocalmaxima`](@ref) | Returns the coordinates of elements whose value is larger than all of their immediate neighbors |
 
-Common kernels (filters) are organized in the `Kernel` and `KernelFactors` modules. 
+Common kernels (filters) are organized in the `Kernel` and `KernelFactors` modules.
 
 A common task in image processing and computer vision is computing
 image *gradients* (derivatives), for which there is the dedicated
@@ -80,6 +80,24 @@ transform (FFT).  By default, this choice is made based on kernel
 size. You can manually specify the algorithm using [`Algorithm.FFT()`](@ref)
 or [`Algorithm.FIR()`](@ref).
 
+#### Reusing FFT plans
+
+It is possible to reuse FFT plans if the operation is going to be done on the
+same array type and dimensions i.e. on each image of an image stack
+
+```julia
+using ImageFiltering, ComputationalResources
+imgstack = rand(Float64, 200, 100, 10)
+imgstack_filtered = similar(imgstack)
+
+kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
+fft_planned = CPU1(ImageFiltering.planned_fft(imgstack_filtered[:,:,1], kernel))
+
+for i in axes(imgstack, 3)
+    imfilter!(fft_planned, imgstack_filtered[:,:,i], imgstack[:,:,i], kernel)
+end
+```
+
 ### Feature: Multithreading
 
 If you launch Julia with `JULIA_NUM_THREADS=n` (where `n > 1`), then
diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl
index df07083..0bfb4d9 100644
--- a/src/ImageFiltering.jl
+++ b/src/ImageFiltering.jl
@@ -1,12 +1,14 @@
 module ImageFiltering
 
 using FFTW
+using RealFFTs
 using ImageCore, FFTViews, OffsetArrays, StaticArrays, ComputationalResources, TiledIteration
 # Where possible we avoid a direct dependency to reduce the number of [compat] bounds
 # using FixedPointNumbers: Normed, N0f8 # reexported by ImageCore
 using ImageCore.MappedArrays
 using Statistics, LinearAlgebra
 using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
+import Base: copy!
 using OffsetArrays: IdentityUnitRange   # using the one in OffsetArrays makes this work with multiple Julia versions
 using SparseArrays   # only needed to fix an ambiguity in borderarray
 using Reexport
@@ -30,7 +32,8 @@ export Kernel, KernelFactors,
     imgradients, padarray, centered, kernelfactors, reflect,
     freqkernel, spacekernel,
     findlocalminima, findlocalmaxima,
-    blob_LoG, BlobLoG
+    blob_LoG, BlobLoG,
+    planned_fft
 
 const FixedColorant{T<:Normed} = Colorant{T}
 const StaticOffsetArray{T,N,A<:StaticArray} = OffsetArray{T,N,A}
@@ -50,16 +53,33 @@ function Base.transpose(A::StaticOffsetArray{T,2}) where T
 end
 
 module Algorithm
+    import FFTW
     # deliberately don't export these, but it's expected that they
     # will be used as Algorithm.FFT(), etc.
     abstract type Alg end
-    "Filter using the Fast Fourier Transform" struct FFT <: Alg end
+    "Filter using the Fast Fourier Transform" struct FFT <: Alg
+        plan1::Union{Function,Nothing}
+        plan2::Union{Function,Nothing}
+        plan3::Union{Function,Nothing}
+    end
+    FFT() = FFT(nothing, nothing, nothing)
+    function Base.show(io::IO, alg::FFT)
+        if alg.plan1 === nothing && alg.plan2 === nothing && alg.plan3 === nothing
+            print(io, "FFT()")
+        else
+            print(io, "FFT(planned)")
+        end
+    end
     "Filter using a direct algorithm" struct FIR <: Alg end
+    Base.show(io::IO, alg::FIR) = print(io, "FIR()")
     "Cache-efficient filtering using tiles" struct FIRTiled{N} <: Alg
         tilesize::Dims{N}
     end
+    Base.show(io::IO, ::FIRTiled{N}) where N = print(io, "FIRTiled{$N}()")
     "Filter with an Infinite Impulse Response filter" struct IIR <: Alg end
+    Base.show(io::IO, alg::IIR) = print(io, "IIR()")
     "Filter with a cascade of mixed types (IIR, FIR)" struct Mixed <: Alg end
+    Base.show(io::IO, alg::Mixed) = print(io, "Mixed()")
 
     FIRTiled() = FIRTiled(())
 end
diff --git a/src/imfilter.jl b/src/imfilter.jl
index 4def267..1aff532 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -826,7 +826,7 @@ function _imfilter_fft!(r::AbstractCPU{FFT},
     for I in CartesianIndices(axes(kern))
         krn[I] = kern[I]
     end
-    Af = filtfft(A, krn)
+    Af = filtfft(A, krn, r.settings.plan1, r.settings.plan2, r.settings.plan3)
     if map(first, axes(out)) == map(first, axes(Af))
         R = CartesianIndices(axes(out))
         copyto!(out, R, Af, R)
@@ -858,6 +858,299 @@ end
 @inline _stretch_mul(AT::Type{<:Real}, A_fft::AbstractArray, BT::Type{<:Complex}, B_fft::AbstractArray, d::Int) = _stretch_mul(BT, B_fft, AT, A_fft, d)
 @inline _stretch_mul(AT::Type{<:Real}, A_fft::AbstractArray, BT::Type{<:Real}, B_fft::AbstractArray, _::Int) = A_fft .* B_fft
 @inline _stretch_mul(AT::Type{<:Complex}, A_fft::AbstractArray, BT::Type{<:Complex}, B_fft::AbstractArray, _::Int) = A_fft .* B_fft
+
+"""
+    buffered_planned_rfft(a::AbstractArray{T}, dims=1:ndims(a)) where {T}
+
+Create a buffered, planned real FFT function for arrays with the same size and type as `a`.
+
+Returns a function that performs an in-place rfft using a pre-computed plan.
+Each call allocates a task-local buffer to ensure thread safety when used concurrently.
+
+For full transforms (all dimensions), uses `RealFFTs.RCpair` for efficiency.
+For partial transforms, uses standard FFTW plans.
+
+This is an internal function used by [`planned_fft`](@ref).
+"""
+function buffered_planned_rfft(a::AbstractArray{T}, dims=1:ndims(a)) where {T}
+    numeric_type = T <: Real ? T : real(T)
+
+    if dims == 1:ndims(a) && length(dims) == ndims(a)
+        # Use RealFFTs.RCpair for full transforms (more efficient)
+        buf_proto = RealFFTs.RCpair{numeric_type}(undef, size(a))
+        plan = RealFFTs.plan_rfft!(buf_proto; flags=FFTW.ESTIMATE)
+        buf_size = size(a)
+        
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, RealFFTs.RCpair{numeric_type}}()
+        bufs_lock = ReentrantLock()
+        
+        return function (arr::AbstractArray)
+            tid = objectid(current_task())
+            buf = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    RealFFTs.RCpair{numeric_type}(undef, buf_size)
+                end
+            end
+            copy!(buf, OffsetArrays.no_offset_view(arr))
+            plan(buf)
+            return copy(complex(buf))
+        end
+    else
+        # Use standard FFTW for partial transforms
+        buf_size = size(a)
+        buf_proto = Array{numeric_type}(undef, buf_size)
+        plan = plan_rfft(buf_proto, dims; flags=FFTW.ESTIMATE)
+        
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, Array{numeric_type}}()
+        bufs_lock = ReentrantLock()
+        
+        return function (arr::AbstractArray)
+            tid = objectid(current_task())
+            buf = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    Array{numeric_type}(undef, buf_size)
+                end
+            end
+            copyto!(buf, OffsetArrays.no_offset_view(arr))
+            return plan * buf
+        end
+    end
+end
+
+"""
+    buffered_planned_irfft(a::AbstractArray{T}, dims=1:ndims(a), d::Int=size(a,1)) where {T}
+
+Create a buffered, planned inverse real FFT function for arrays with the same size and type as `a`.
+
+Returns a function that performs an in-place irfft using a pre-computed plan.
+Each call uses a task-local buffer to ensure thread safety when used concurrently.
+
+# Arguments
+- `a`: Prototype array defining the size and element type
+- `dims`: Dimensions along which to perform the transform (default: all dimensions)
+- `d`: Size of the first transformed dimension in the original (real) array
+
+This is an internal function used by [`planned_fft`](@ref).
+"""
+function buffered_planned_irfft(a::AbstractArray{T}, dims=1:ndims(a), d::Int=size(a,1)) where {T}
+    numeric_type = T <: Real ? T : real(T)
+
+    if dims == 1:ndims(a) && length(dims) == ndims(a)
+        # Use RealFFTs.RCpair for full transforms (more efficient)
+        buf_proto = RealFFTs.RCpair{numeric_type}(undef, size(a))
+        plan = RealFFTs.plan_irfft!(buf_proto; flags=FFTW.ESTIMATE)
+        buf_size = size(a)
+        
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, RealFFTs.RCpair{numeric_type}}()
+        bufs_lock = ReentrantLock()
+        
+        return function (arr::AbstractArray)
+            tid = objectid(current_task())
+            buf = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    RealFFTs.RCpair{numeric_type}(undef, buf_size)
+                end
+            end
+            copy!(buf, OffsetArrays.no_offset_view(arr))
+            plan(buf)
+            return copy(real(buf))
+        end
+    else
+        # Use standard FFTW for partial transforms
+        input_size = collect(size(a))
+        input_size[dims[1]] = input_size[dims[1]] ÷ 2 + 1
+        buf_size = Tuple(input_size)
+        buf_proto = Array{Complex{numeric_type}}(undef, buf_size)
+        plan = plan_irfft(buf_proto, d, dims; flags=FFTW.ESTIMATE)
+        
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, Array{Complex{numeric_type}}}()
+        bufs_lock = ReentrantLock()
+        
+        return function (arr::AbstractArray)
+            tid = objectid(current_task())
+            buf_in = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    Array{Complex{numeric_type}}(undef, buf_size)
+                end
+            end
+            copyto!(buf_in, OffsetArrays.no_offset_view(arr))
+            return plan * buf_in
+        end
+    end
+end
+
+"""
+    planned_fft(A, kernel, [border="replicate"]) -> Algorithm.FFT
+
+Create a planned FFT algorithm that can be reused for filtering multiple images of the
+same size and type as `A`.
+
+This is useful when filtering many images with the same kernel, as the FFT plans are
+computed once and reused. The returned algorithm is thread-safe and can be used
+concurrently from multiple threads.
+
+# Arguments
+- `A`: A prototype array (the actual values are not used, only size and element type)
+- `kernel`: The filter kernel (array, tuple of arrays, or `Kernel` type)
+- `border`: Border specification (default: `"replicate"`). Note: `NA()` borders are not supported.
+
+# Returns
+An `Algorithm.FFT` instance with pre-computed plans that can be passed to `imfilter` or `imfilter!`.
+
+# Example
+```julia
+using ImageFiltering, ComputationalResources
+
+# Create a stack of images to filter
+imgstack = [rand(Float64, 512, 512) for _ in 1:100]
+kernel = Kernel.gaussian((3, 3))
+
+# Create the planned FFT algorithm once
+planned_alg = planned_fft(imgstack[1], kernel, "replicate")
+
+# Reuse for all images (can be parallelized with Threads.@threads)
+results = [imfilter(img, kernel, "replicate", planned_alg) for img in imgstack]
+
+# Or use with imfilter! and ComputationalResources
+out = similar(imgstack[1])
+for img in imgstack
+    imfilter!(CPU1(planned_alg), out, img, kernel, "replicate")
+    # process out...
+end
+```
+
+# Notes
+- The plan is specific to the array size, element type, kernel, and border specification.
+  Using it with differently-sized arrays will error.
+- IIR filters (like `KernelFactors.IIRGaussian`) are not supported; use `Algorithm.IIR()` instead.
+- For colorant images (e.g., `RGB`, `Gray`), the plans handle channel separation automatically.
+
+See also: [`imfilter`](@ref), [`imfilter!`](@ref), [`Algorithm.FFT`](@ref)
+"""
+function planned_fft(A::AbstractArray{T,N},
+            kernel::ProcessedKernel,
+            border::BorderSpecAny=Pad(:replicate)
+            ) where {T,N}
+    # Check if any kernel is an IIR filter (not compatible with FFT)
+    for k in kernel
+        # Check both direct IIR filters and ReshapedOneD-wrapped IIR filters
+        k_data = k isa ReshapedOneD ? k.data : k
+        if k_data isa IIRFilter
+            throw(ArgumentError("planned_fft does not support IIR filters like TriggsSdika or IIRGaussian. " *
+                              "IIR filters use a different algorithm that cannot be accelerated with FFT plans. " *
+                              "Use Algorithm.IIR() instead, or use imfilter without a plan."))
+        end
+    end
+    
+    # Use ffteltype to convert to appropriate floating point type for FFT
+    FT = ffteltype(T)
+
+    # Determine if we're dealing with colorants
+    if T <: Colorant
+        # For colorants, create target type with FFT-compatible element type
+        target_T = base_colorant_type(T){FT}
+        bord = border(kernel, A, Algorithm.FFT())
+        _A = padarray(target_T, A, bord)
+
+        # Get channelview and dims for colorant processing
+        Av, dims = channelview_dims(_A)
+
+        # Create planned functions that handle the dims parameter for colorants
+        bfp1 = buffered_planned_rfft(Av, dims)
+
+        # Create kernel for colorants
+        kern = samedims(_A, kernelconv(kernel...))
+        krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+        for I in CartesianIndices(axes(kern))
+            krn[I] = kern[I]
+        end
+        kernrs = kreshape(T, krn)
+        bfp2 = buffered_planned_rfft(kernrs, dims)
+
+        # Create planned irfft with proper dimensions
+        bfp3 = buffered_planned_irfft(Av, dims, length(axes(Av, dims[1])))
+    else
+        # For regular arrays
+        bord = border(kernel, A, Algorithm.FFT())
+        _A = padarray(FT, A, bord)
+        bfp1 = buffered_planned_rfft(_A)
+        kern = samedims(_A, kernelconv(kernel...))
+        krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+        bfp2 = buffered_planned_rfft(krn)
+        bfp3 = buffered_planned_irfft(_A)
+    end
+
+    return Algorithm.FFT(bfp1, bfp2, bfp3)
+end
+planned_fft(A::AbstractArray, kernel, border::AbstractString) = planned_fft(A, kernel, borderinstance(border))
+planned_fft(A::AbstractArray, kernel::Union{ArrayLike,Laplacian}, border::BorderSpecAny) = planned_fft(A, factorkernel(kernel), border)
+
+# Error methods for NA borders - these should not be used with planned FFT
+function planned_fft(A::AbstractArray{T,N},
+            kernel::ProcessedKernel,
+            border::NA) where {T,N}
+    throw(ArgumentError("NA borders are not supported with planned FFT algorithms."))
+end
+
+function planned_fft(A::AbstractArray,
+            kernel::Union{ArrayLike,Laplacian},
+            border::NA)
+    throw(ArgumentError("NA borders are not supported with planned FFT algorithms."))
+end
+
+planned_fft(A::AbstractArray, kernel, border::NA) = throw(ArgumentError("NA borders are not supported with planned FFT algorithms."))
+
+# Unified filtfft function for planned FFT operations
+function filtfft(A::AbstractArray{T}, krn, planned_rfft1::Function, planned_rfft2::Function, planned_irfft::Function) where {T}
+    if T <: Colorant
+        # Handle colorant arrays
+        Av, dims = channelview_dims(A)
+        kernrs = kreshape(T, krn)
+
+        # Use planned functions with proper dims handling
+        B = planned_rfft1(Av)
+        krn_buf = planned_rfft2(kernrs)
+        B .*= conj!(krn_buf)
+        Avf_no_offset = planned_irfft(B)
+
+        # Restore offset information by creating an OffsetArray with the original axes
+        Avf = OffsetArray(Avf_no_offset, axes(Av))
+
+        # Convert back to colorant array
+        return colorview(base_colorant_type(T){eltype(Avf)}, Avf)
+    else
+        # Handle regular arrays
+        B = planned_rfft1(A)
+        # Create a proper FFTView for the kernel and populate it
+        krn_fft = FFTView(zeros(eltype(krn), map(length, axes(A))))
+        for I in CartesianIndices(axes(krn))
+            krn_fft[I] = krn[I]
+        end
+        krn_buf = planned_rfft2(krn_fft)
+        B .*= conj!(krn_buf)
+        result = planned_irfft(B)
+
+        # Ensure the result has the same axes as the input array A
+        if axes(result) != axes(A)
+            # The planned IRFFT buffer might have different size or axes
+            if size(result) != size(A)
+                # Extract the portion that matches the size of A
+                result_view = view(result, ntuple(i -> 1:size(A,i), ndims(A))...)
+                return OffsetArray(collect(result_view), axes(A))
+            else
+                # Same size but different axes - use collect to get the actual data
+                return OffsetArray(collect(result), axes(A))
+            end
+        end
+        return result
+    end
+end
+filtfft(A, krn, ::Nothing, ::Nothing, ::Nothing) = filtfft(A, krn)
+
 function filtfft(A::AbstractArray{ST}, krn::AbstractArray{KT}) where {ST<:Union{Real,Complex},KT<:Union{Real,Complex}}
     CT = promote_type(ST, KT)
     B = _fft(A)
@@ -868,9 +1161,16 @@ end
 function filtfft(A::AbstractArray{CT}, krn) where {CT<:Colorant}
     Av, dims = channelview_dims(A)
     kernrs = kreshape(CT, krn)
-    B = rfft(Av, dims)
+    # Preserve offset information by working with offset arrays directly
+    # but use no_offset_view for RFFT operations
+    Av_no_offset = OffsetArrays.no_offset_view(Av)
+    B = rfft(Av_no_offset, dims)
     B .*= conj!(rfft(kernrs, dims))
-    Avf = irfft(B, length(axes(Av, dims[1])), dims)
+    Avf_no_offset = irfft(B, size(Av_no_offset, dims[1]), dims)
+
+    # Restore offset information by creating an OffsetArray with the original axes
+    Avf = OffsetArray(Avf_no_offset, axes(Av))
+
     colorview(base_colorant_type(CT){eltype(Avf)}, Avf)
 end
 channelview_dims(A::AbstractArray{C,N}) where {C<:Colorant,N} = channelview(A), ntuple(d -> d + 1, Val(N))
@@ -878,7 +1178,7 @@ channelview_dims(A::AbstractArray{C,N}) where {C<:ImageCore.Color1,N} = channelv
 
 function kreshape(::Type{C}, krn::FFTView) where {C<:Colorant}
     kern = parent(krn)
-    kernrs = FFTView(reshape(kern, 1, size(kern)...))
+    return FFTView(reshape(kern, 1, size(kern)...))
 end
 kreshape(::Type{C}, krn::FFTView) where {C<:ImageCore.Color1} = krn
 
diff --git a/src/utils.jl b/src/utils.jl
index 80e3839..e3ce850 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -126,7 +126,7 @@ const SmallInts = Union{UInt8,Int8,UInt16,Int16}
 accumfilter(pixelval::SmallInts, filterval::SmallInts) = Int(pixelval)*Int(filterval)
 # advice: don't use FixedPoint for the kernel
 accumfilter(pixelval::N0f8, filterval::N0f8) = Float32(pixelval)*Float32(filterval)
-accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filterval)
+accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(pixelval)*Float32(filterval)
 
 # In theory, the following might need to be specialized. For safety, make it a
 # standalone function call.
diff --git a/test/2d.jl b/test/2d.jl
index c3cdb72..26bbd91 100644
--- a/test/2d.jl
+++ b/test/2d.jl
@@ -36,6 +36,42 @@ using ImageFiltering: borderinstance, filtfft
     end
 end
 
+# Helper function to check if a type supports planned_fft
+function supports_planned_fft(::Type{T}) where T
+    # AbstractFloat types are directly supported
+    T <: AbstractFloat && return true
+
+    # Colorant types are supported if their element type can be converted to FFT-compatible types
+    if T <: Colorant
+        try
+            # Check if ffteltype can convert the element type
+            elt = eltype(T)
+            fft_type = ImageFiltering.ffteltype(elt)
+            return fft_type <: Union{Float32, Float64}
+        catch
+            return false
+        end
+    end
+
+    return false
+end
+
+function supported_algs(img::AbstractArray{T}, kernel, border) where T
+    base_algs = (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+
+    # Include planned_fft if:
+    # 1. The image type supports planned_fft (AbstractFloat or compatible Colorants)
+    # 2. All kernel elements are floating point or complex floating point
+    # 3. Border is not NA (since NA requires special handling)
+    if supports_planned_fft(T) &&
+       all(k -> eltype(k) <: Union{AbstractFloat, Complex{<:AbstractFloat}}, (kernel isa Tuple ? kernel : (kernel,))) &&
+       !isa(border, NA)
+        return (base_algs..., planned_fft(img, kernel, border))
+    else
+        return base_algs
+    end
+end
+
 @testset "FIR/FFT" begin
     f32type(img) = f32type(eltype(img))
     f32type(::Type{C}) where {C<:Colorant} = base_colorant_type(C){Float32}
@@ -47,100 +83,110 @@ end
     imgi = zeros(Int, 5, 7); imgi[3,4] = 1
     imgg = fill(Gray{N0f8}(0), 5, 7); imgg[3,4] = 1
     imgc = fill(RGB{N0f8}(0,0,0), 5, 7); imgc[3,4] = RGB(1,0,0)
-    # Dense inseparable kernel
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    for img in (imgf, imgi, imgg, imgc)
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
-        @test @inferred(imfilter(img, kernel)) ≈ targetimg
-        @test @inferred(imfilter(img, (kernel,))) ≈ targetimg
-        @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
-        ret = imfilter(img, kernel)
-        fill!(ret, zero(eltype(ret)))
-        @test @inferred(imfilter!(ret, img, kernel)) ≈ targetimg
-        fill!(ret, zero(eltype(ret)))
-        @test @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel)) ≈ targetimg
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+    imgs = (("Float64", imgf), ("Int", imgi), ("Gray{N0f8}", imgg), ("RGB{N0f8}", imgc))
+    @testset "Dense inseparable kernel" begin
+        kern = [0.1 0.2; 0.4 0.5]
+        kernel = OffsetArray(kern, -1:0, 1:2)
+        @testset "$imgname" for (imgname, img) in imgs
+            targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+            targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
+            @test @inferred(imfilter(img, kernel)) ≈ targetimg
+            @test @inferred(imfilter(img, (kernel,))) ≈ targetimg
+            @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
+            ret = imfilter(img, kernel)
+            fill!(ret, zero(eltype(ret)))
+            @test @inferred(imfilter!(ret, img, kernel)) ≈ targetimg
             fill!(ret, zero(eltype(ret)))
-            @test @inferred(imfilter!(ret, img, kernel, border)) ≈ targetimg
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(img, (kernel,), border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+            @test @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel)) ≈ targetimg
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
                 fill!(ret, zero(eltype(ret)))
-                @test @inferred(imfilter!(CPU1(alg), ret, img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter!(ret, img, kernel, border)) ≈ targetimg
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(img, (kernel,), border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                    fill!(ret, zero(eltype(ret)))
+                    @test @inferred(imfilter!(CPU1(alg), ret, img, kernel, border)) ≈ targetimg
+                end
+                @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
             end
-            @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
-        end
-        targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
-            @test @inferred(imfilter(CPU1(alg), img, kernel, Inner())) ≈ targetimg_inner
-        end
-    end
-    # Factored kernel
-    kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
-    kern = parent(kernel[1]).*parent(kernel[2])
-    for img in (imgf, imgi, imgg, imgc)
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
-        ret = similar(targetimg)
-        @test @inferred(imfilter(img, kernel)) ≈ targetimg
-        @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+            @testset "Inner()" begin
+                targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
+                @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
+                @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
+                @testset "$alg" for alg in supported_algs(img, kernel, Inner())
+                    @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
+                    @test @inferred(imfilter(CPU1(alg), img, kernel, Inner())) ≈ targetimg_inner
+                end
             end
-            @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
-            @test_throws ErrorException imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, borderinstance(border), axes(ret)) #167
-        end
-        targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
         end
     end
-    # Rational filter coefficients
-    kernel = (centered([1//3, 1//3, 1//3]), centered([1//3, 1//3, 1//3]'))
-    kern = kernel[1].*kernel[2]
-    for img in (imgf, imgi, imgg, imgc)
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[2:4,3:5] .= img[3,4]*(1//9)
-        @test @inferred(imfilter(img, kernel)) ≈ targetimg
-        @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                if alg == Algorithm.FFT() && eltype(img) == Int
-                    @test @inferred(imfilter(Float64, img, kernel, border, alg)) ≈ targetimg
-                else
+    @testset "Factored kernel" begin
+        kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
+        kern = parent(kernel[1]).*parent(kernel[2])
+        @testset "$imgname" for (imgname, img) in imgs
+            targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+            targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
+            ret = similar(targetimg)
+            @test @inferred(imfilter(img, kernel)) ≈ targetimg
+            @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
                     @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+                @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
+                @test_throws ErrorException imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, borderinstance(border), axes(ret)) #167
+            end
+            @testset "Inner()" begin
+                targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
+                @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
+                @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
+                @testset "$alg" for alg in supported_algs(img, kernel, Inner())
+                    @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
                 end
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
             end
         end
-        targetimg_inner = OffsetArray(targetimg[2:end-1, 2:end-1], 2:4, 2:6)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            if alg == Algorithm.FFT() && eltype(img) == Int
-                @test @inferred(imfilter(Float64, img, kernel, Inner(), alg)) ≈ targetimg_inner
-            else
-                @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+    end
+    @testset "Rational filter coefficients" begin
+        kernel = (centered([1//3, 1//3, 1//3]), centered([1//3, 1//3, 1//3]'))
+        kern = kernel[1].*kernel[2]
+        @testset "$imgname" for (imgname, img) in imgs
+            targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+            targetimg[2:4,3:5] .= img[3,4]*(1//9)
+            @test @inferred(imfilter(img, kernel)) ≈ targetimg
+            @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    if alg == Algorithm.FFT() && eltype(img) == Int
+                        @test @inferred(imfilter(Float64, img, kernel, border, alg)) ≈ targetimg
+                    else
+                        @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    end
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+            end
+            @testset "Inner()" begin
+                targetimg_inner = OffsetArray(targetimg[2:end-1, 2:end-1], 2:4, 2:6)
+                @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
+                @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
+                @testset "$alg" for alg in supported_algs(img, kernel, Inner())
+                    if alg == Algorithm.FFT() && eltype(img) == Int
+                        @test @inferred(imfilter(Float64, img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    else
+                        @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    end
+                    @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
+                end
             end
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
         end
     end
     ## Images for which the boundary conditions matter
@@ -148,9 +194,8 @@ end
     imgi = zeros(Int, 5, 7); imgi[1,2] = 1
     imgg = fill(Gray(0), 5, 7); imgg[1,2] = 1
     imgc = fill(RGB(0,0,0), 5, 7); imgc[1,2] = RGB(1,0,0)
-    # Dense kernel
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
+    imgs = (("Float64", imgf), ("Int", imgi), ("Gray{N0f8}", imgg), ("RGB{N0f8}", imgc))
+
     function target1(img::AbstractArray{T}, kern, border) where T
         if border ∈ ("replicate", "symmetric")
             ret = float64.(zero(img))
@@ -179,63 +224,111 @@ end
         ret[:,end] .= nan(eltype(ret))  # for the last column, this kernel is entirely in the padding region
         ret
     end
-    for img in (imgf, imgi, imgg, imgc)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            targetimg = target1(img, kern, border)
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+    @testset "Dense kernel" begin
+        kern = [0.1 0.2; 0.4 0.5]
+        kernel = OffsetArray(kern, -1:0, 1:2)
+        @testset "$imgname" for (imgname, img) in imgs
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                targetimg = target1(img, kern, border)
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+            end
+            @testset "NA()" begin
+                border = NA()
+                targetimg0 = target1(img, kern, border)
+                nanflag = isnan.(targetimg0)
+                targetimg = zerona!(copy(targetimg0))
+                @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
+                @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
+                    @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
+                end
             end
-        end
-        border = NA()
-        targetimg0 = target1(img, kern, border)
-        nanflag = isnan.(targetimg0)
-        targetimg = zerona!(copy(targetimg0))
-        @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
-        @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
-            @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
         end
     end
-    # Factored kernel
-    kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
-    kern = parent(kernel[1]).*parent(kernel[2])
-    for img in (imgf, imgi, imgg, imgc)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            targetimg = target1(img, kern, border)
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+    @testset  "Factored kernel" begin
+        kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
+        kern = parent(kernel[1]).*parent(kernel[2])
+        @testset "$imgname" for (imgname, img) in imgs
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                targetimg = target1(img, kern, border)
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+            end
+            @testset "NA()" begin
+                border = NA()
+                targetimg0 = target1(img, kern, border)
+                nanflag = isnan.(targetimg0)
+                targetimg = zerona!(copy(targetimg0))
+                @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
+                @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
+                    @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
+                end
             end
         end
-        border = NA()
-        targetimg0 = target1(img, kern, border)
-        nanflag = isnan.(targetimg0)
-        targetimg = zerona!(copy(targetimg0))
-        @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
-        @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
-            @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
+        # filtering with a 0d kernel
+        a = rand(3,3)
+        # @test imfilter(a, reshape([2])) ≈ 2a
+        # imfilter! as a copy
+        ret = zeros(3, 3)
+        b = @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, a, (), NoPad()))
+        @test b == a
+        @test !(b === a)
+        # OffsetArrays
+        img = OffsetArray(rand(RGB{N0f8}, 80, 100), (-5, 3))
+        imgf = imfilter(img, Kernel.gaussian((3,3)))
+        @test axes(imgf) == axes(img)
+    end
+
+    @testset "Planned FFT edge cases" begin
+        # Test case 1: axes(result) != axes(A) but size(result) == size(A)
+        A_offset = OffsetArray(rand(Float64, 6, 8), (-1, 1))
+        kernel = rand(Float64, 3, 3)
+        planned_alg = planned_fft(A_offset, kernel, Fill(0.0))
+        result_planned = imfilter(A_offset, kernel, Fill(0.0), planned_alg)
+        result_standard = imfilter(A_offset, kernel, Fill(0.0), Algorithm.FFT())
+        @test axes(result_planned) == axes(A_offset)
+        @test result_planned ≈ result_standard
+
+        # Test case 2: size(result) != size(A) - create artificial size mismatch
+        A = rand(Float64, 7, 9)
+        bord = Fill(0.0)(rand(3,5), A, Algorithm.FFT())
+        _A = ImageFiltering.padarray(Float64, A, bord)
+        kern = ImageFiltering.samedims(_A, ImageFiltering.kernelconv(rand(3,5)))
+        krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+
+        # Create custom planned function that returns larger buffer
+        original_irfft = ImageFiltering.buffered_planned_irfft(_A)
+        custom_irfft = function(arr)
+            result = original_irfft(arr)
+            if size(result, 1) > 2 && size(result, 2) > 2
+                # Return buffer that's 1 larger in each dimension
+                larger = zeros(eltype(result), size(result, 1) + 1, size(result, 2) + 1)
+                larger[1:size(result,1), 1:size(result,2)] = result
+                return larger
+            end
+            return result
         end
+
+        result_custom = ImageFiltering.filtfft(_A, krn,
+            ImageFiltering.buffered_planned_rfft(_A),
+            ImageFiltering.buffered_planned_rfft(krn),
+            custom_irfft)
+        result_standard = ImageFiltering.filtfft(_A, krn)
+        @test size(result_custom) == size(_A)
+        @test result_custom ≈ result_standard
     end
-    # filtering with a 0d kernel
-    a = rand(3,3)
-    # @test imfilter(a, reshape([2])) ≈ 2a
-    # imfilter! as a copy
-    ret = zeros(3, 3)
-    b = @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, a, (), NoPad()))
-    @test b == a
-    @test !(b === a)
-    # OffsetArrays
-    img = OffsetArray(rand(RGB{N0f8}, 80, 100), (-5, 3))
-    imgf = imfilter(img, Kernel.gaussian((3,3)))
-    @test axes(imgf) == axes(img)
 end
 
 @testset "Borders (issue #85)" begin
@@ -356,3 +449,215 @@ end
     @test filtfft(C, D) ≈ filtfft(C, ComplexF32.(D))
     @test filtfft(C, D) ≈ filtfft(ComplexF32.(C), ComplexF32.(D))
 end
+
+@testset "Planned FFT with NA borders should error" begin
+    imgf = zeros(Float64, 5, 7); imgf[3,4] = 1.0
+    imgi = zeros(Int, 5, 7); imgi[3,4] = 1
+    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
+    kernel = OffsetArray([0.1 0.2; 0.4 0.5], -1:0, 1:2)
+    na_border = NA()
+
+    @test_throws ArgumentError planned_fft(imgf, kernel, na_border)
+    @test_throws ArgumentError planned_fft(imgi, kernel, na_border)
+    @test_throws ArgumentError planned_fft(imgc, kernel, na_border)
+    kern_tuple = (OffsetArray([0.2, 0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
+    @test_throws ArgumentError planned_fft(imgf, kern_tuple, na_border)
+    @test_throws ArgumentError planned_fft(imgf, Kernel.Laplacian(), na_border)
+end
+
+@testset "Planned FFT with IIR filters should error" begin
+    imgf = rand(Float64, 50, 50)
+    
+    # IIRGaussian is an IIR filter
+    iir_kernel = KernelFactors.IIRGaussian(Float64, (3.0,))
+    @test_throws ArgumentError planned_fft(imgf, iir_kernel, "replicate")
+    
+    # TriggsSdika tuple
+    ts_kernel = KernelFactors.IIRGaussian(Float64, (3.0, 3.0))
+    @test_throws ArgumentError planned_fft(imgf, ts_kernel, Pad(:replicate))
+end
+
+@testset "Plan Reuse Across Multiple Images" begin
+    @testset "Reuse with same-size Float64 images" begin
+        # Create multiple images of the same size
+        images = [rand(Float64, 50, 50) for _ in 1:5]
+        # Add some structure to make filtering meaningful
+        for img in images
+            img[20:30, 20:30] .= 1.0
+        end
+        
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        # Create plan once
+        planned_alg = planned_fft(images[1], kernel, border)
+        
+        # Reuse plan for all images
+        for (i, img) in enumerate(images)
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_standard = imfilter(img, kernel, border, Algorithm.FFT())
+            @test result_planned ≈ result_standard rtol=1e-10
+        end
+    end
+    
+    @testset "Reuse with same-size Gray images" begin
+        images = [rand(Gray{Float64}, 40, 60) for _ in 1:3]
+        kernel = Kernel.gaussian((3, 3))
+        border = Pad(:replicate)
+        
+        planned_alg = planned_fft(images[1], kernel, border)
+        
+        for img in images
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_standard = imfilter(img, kernel, border, Algorithm.FFT())
+            @test result_planned ≈ result_standard rtol=1e-10
+        end
+    end
+    
+    @testset "Wrong-size image should error" begin
+        img1 = rand(Float64, 50, 50)
+        img2 = rand(Float64, 60, 60)  # Different size
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(img1, kernel, border)
+        
+        # First image should work
+        result1 = imfilter(img1, kernel, border, planned_alg)
+        @test result1 isa Array{Float64, 2}
+        
+        # Wrong-size image should throw an error
+        @test_throws Exception imfilter(img2, kernel, border, planned_alg)
+    end
+    
+    @testset "Different element type with same size" begin
+        img_f64 = rand(Float64, 50, 50)
+        img_f32 = rand(Float32, 50, 50)
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(img_f64, kernel, border)
+        
+        # Float32 image with Float64 plan should work (gets converted)
+        result = imfilter(img_f32, kernel, border, planned_alg)
+        @test result isa Array
+        
+        # Compare with standard FFT
+        result_standard = imfilter(img_f32, kernel, border, Algorithm.FFT())
+        @test result ≈ result_standard rtol=1e-6
+    end
+end
+
+@testset "Concurrent Filtering with Planned FFT" begin
+    # Only run if we have multiple threads
+    if Threads.nthreads() > 1
+        @testset "Concurrent Float64 filtering" begin
+            n_images = 10
+            images = [rand(Float64, 100, 100) for _ in 1:n_images]
+            # Add structure to images
+            for img in images
+                img[40:60, 40:60] .= rand()
+            end
+            
+            kernel = Kernel.gaussian((5, 5))
+            border = Pad(:replicate)
+            
+            # Create plan once
+            planned_alg = planned_fft(images[1], kernel, border)
+            results = Vector{Any}(undef, n_images)
+            
+            # Filter concurrently
+            Threads.@threads for i in 1:n_images
+                results[i] = imfilter(images[i], kernel, border, planned_alg)
+            end
+            
+            # Verify correctness
+            for i in 1:n_images
+                expected = imfilter(images[i], kernel, border, Algorithm.FFT())
+                @test results[i] ≈ expected rtol=1e-8
+            end
+        end
+        
+        @testset "Concurrent Gray image filtering" begin
+            n_images = 8
+            images = [rand(Gray{Float64}, 80, 80) for _ in 1:n_images]
+            kernel = Kernel.gaussian((3, 3))
+            border = Fill(Gray(0.0))
+            
+            planned_alg = planned_fft(images[1], kernel, border)
+            results = Vector{Any}(undef, n_images)
+            
+            Threads.@threads for i in 1:n_images
+                results[i] = imfilter(images[i], kernel, border, planned_alg)
+            end
+            
+            for i in 1:n_images
+                expected = imfilter(images[i], kernel, border, Algorithm.FFT())
+                @test results[i] ≈ expected rtol=1e-8
+            end
+        end
+    else
+        @info "Skipping concurrent tests (only 1 thread available). Run with multiple threads to test concurrent filtering."
+    end
+end
+
+@testset "Plan Reuse Validation" begin
+    @testset "Plan structure validation" begin
+        img = rand(Float64, 50, 50)
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(img, kernel, border)
+        
+        # Verify it's an Algorithm.FFT with plans
+        @test planned_alg isa Algorithm.FFT
+        @test planned_alg.plan1 !== nothing
+        @test planned_alg.plan2 !== nothing
+        @test planned_alg.plan3 !== nothing
+        
+        # Verify plans are callable functions
+        @test planned_alg.plan1 isa Function
+        @test planned_alg.plan2 isa Function
+        @test planned_alg.plan3 isa Function
+    end
+    
+    @testset "Compare planned vs unplanned FFT" begin
+        img = rand(Float64, 100, 100)
+        img[45:55, 45:55] .= 1.0
+        kernel = Kernel.gaussian((7, 7))
+        border = Pad(:symmetric)
+        
+        # Planned FFT
+        planned_alg = planned_fft(img, kernel, border)
+        result_planned = imfilter(img, kernel, border, planned_alg)
+        
+        # Unplanned FFT
+        result_unplanned = imfilter(img, kernel, border, Algorithm.FFT())
+        
+        # Results should be very close (within numerical precision)
+        @test result_planned ≈ result_unplanned rtol=1e-10
+        
+        # Verify we get the same axes
+        @test axes(result_planned) == axes(result_unplanned)
+    end
+    
+    @testset "Offset arrays with planned FFT reuse" begin
+        images = [OffsetArray(rand(Float64, 50, 50), (0:49, 1:50)) for _ in 1:3]
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(images[1], kernel, border)
+        
+        for img in images
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_standard = imfilter(img, kernel, border, Algorithm.FFT())
+            
+            # Check axes are preserved
+            @test axes(result_planned) == axes(img)
+            @test axes(result_standard) == axes(img)
+            
+            # Check values match
+            @test result_planned ≈ result_standard rtol=1e-10
+        end
+    end
+end
diff --git a/test/gabor.jl b/test/gabor.jl
index 5c77508..faa49cb 100644
--- a/test/gabor.jl
+++ b/test/gabor.jl
@@ -7,11 +7,15 @@ using ImageFiltering, Test, Statistics
     size_y = 6*σy+1
     γ = σx/σy
     # zero size forces default kernel width, with warnings
-    @info "Four warnings are expected"
-    kernel = Kernel.gabor(0,0,σx,0,5,γ,0)
-    @test isequal(size(kernel[1]),(size_x,size_y))
-    kernel = Kernel.gabor(0,0,σx,π,5,γ,0)
-    @test isequal(size(kernel[1]),(size_x,size_y))
+
+    @test_logs (:warn, r"The input parameter size_") match_mode=:any begin
+        kernel = Kernel.gabor(0,0,σx,0,5,γ,0)
+        @test isequal(size(kernel[1]),(size_x,size_y))
+    end
+    @test_logs (:warn, r"The input parameter size_") match_mode=:any begin
+        kernel = Kernel.gabor(0,0,σx,π,5,γ,0)
+        @test isequal(size(kernel[1]),(size_x,size_y))
+    end
 
     for x in 0:4, y in 0:4, z in 0:4, t in 0:4
         σx1 = 2*x+1
diff --git a/test/nd.jl b/test/nd.jl
index 8842743..b57ae3f 100644
--- a/test/nd.jl
+++ b/test/nd.jl
@@ -49,8 +49,11 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
     # Element-type widening (issue #17)
     v = fill(0xff, 10)
     kern = centered(fill(0xff, 3))
-    @info "Two warnings are expected"
-    @test_throws InexactError imfilter(v, kern)
+
+    @test_logs (:warn, r"Likely overflow or conversion error detected") begin
+        @test_throws InexactError imfilter(v, kern)
+    end
+
     vout = imfilter(UInt32, v, kern)
     @test eltype(vout) == UInt32
     @test all(x->x==0x0002fa03, vout)
@@ -106,8 +109,8 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
         around_i = [abs(i-j) <= 15 for j in eachindex(v)]
         @test all(isequal(x), wf[around_i])
         @test wf[.!around_i] ≈ vf[.!around_i]
-    end   
-   
+    end
+
     # Issue #110
     img = reinterpret(WrappedFloat, rand(128))
     kern = centered(rand(31))
@@ -119,7 +122,9 @@ end
     # issue #17
     img = fill(typemax(Int16), 10, 10)
     kern = centered(Int16[1 2 2 2 1])
-    @test_throws InexactError imfilter(img, kern)
+    @test_logs (:warn, r"Likely overflow or conversion error detected") begin
+        @test_throws InexactError imfilter(img, kern)
+    end
     ret = imfilter(Int32, img, kern)
     @test eltype(ret) == Int32
     @test all(x->x==262136, ret)
@@ -129,6 +134,7 @@ end
     img = trues(10,10,10)
     kernel = centered(trues(3,3,3)/27)
     for border in ("replicate", "circular", "symmetric", "reflect", Fill(true))
+        # TODO: add support for boolean images in planned_fft
         for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
             @test imfilter(img, kernel, border) ≈ img
         end
diff --git a/test/runtests.jl b/test/runtests.jl
index cfb5ae5..955179d 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,13 @@ using ImageQualityIndexes
 import StaticArrays
 using Random
 
+function typestring(::Type{T}) where T   # from https://github.com/JuliaImages/ImageCore.jl/pull/133
+    buf = IOBuffer()
+    show(buf, T)
+    String(take!(buf))
+end
+
+@testset "ImageFiltering" verbose=true begin
 @testset "Project meta quality checks" begin
     # Ambiguity test
     if Base.VERSION >= v"1.6.0-DEV.1005"   # julia #37616
@@ -26,12 +33,6 @@ using Random
     end
 end
 
-function typestring(::Type{T}) where T   # from https://github.com/JuliaImages/ImageCore.jl/pull/133
-    buf = IOBuffer()
-    show(buf, T)
-    String(take!(buf))
-end
-
 include("compat.jl")
 include("border.jl")
 include("nd.jl")
@@ -49,7 +50,6 @@ include("models.jl")
 
 CUDA_INSTALLED = false
 try
-    global CUDA_INSTALLED
     # This errors with `IOError` when nvidia driver is not available,
     # in which case we don't even need to try `using CUDA`
     run(pipeline(`nvidia-smi`, stdout=devnull, stderr=devnull))
@@ -71,3 +71,5 @@ else
     @warn "CUDA test: disabled"
 end
 nothing
+
+end