From 7e269cfaf2e36d3255ac718d81f2f41736f107ca Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Fri, 12 Jan 2024 13:49:25 -0500
Subject: [PATCH 1/3] Enable reusing fft plans

---
 .github/workflows/Documentation.yml |  13 +-
 .github/workflows/UnitTest.yml      |   4 +-
 Project.toml                        |   4 +-
 dev/debug_planned.jl                | 948 ++++++++++++++++++++++++++++
 docs/src/index.md                   |  22 +-
 src/ImageFiltering.jl               |  24 +-
 src/imfilter.jl                     | 207 +++++-
 src/utils.jl                        |   2 +-
 test/2d.jl                          | 577 +++++++++++++----
 test/gabor.jl                       |  14 +-
 test/nd.jl                          |  16 +-
 test/runtests.jl                    |  16 +-
 12 files changed, 1672 insertions(+), 175 deletions(-)
 create mode 100644 dev/debug_planned.jl

diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml
index 6fc79024..80bff9c9 100644
--- a/.github/workflows/Documentation.yml
+++ b/.github/workflows/Documentation.yml
@@ -17,20 +17,11 @@ jobs:
         os: [ubuntu-latest]
     steps:
       - uses: actions/checkout@v4
-      - uses: julia-actions/setup-julia@latest
+      - uses: julia-actions/setup-julia@v2
         with:
           version: ${{ matrix.julia-version }}
       - name: Cache artifacts
-        uses: actions/cache@v4
-        env:
-          cache-name: cache-artifacts
-        with:
-          path: ~/.julia/artifacts 
-          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
-          restore-keys: |
-            ${{ runner.os }}-test-${{ env.cache-name }}-
-            ${{ runner.os }}-test-
-            ${{ runner.os }}-
+        uses: julia-actions/cache@v2
       - name: Install dependencies
         run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
       - name: Build and deploy
diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
index cff571af..8a450da2 100644
--- a/.github/workflows/UnitTest.yml
+++ b/.github/workflows/UnitTest.yml
@@ -44,7 +44,9 @@ jobs:
         uses: julia-actions/cache@v2
 
       - name: "Unit Test"
-        uses: julia-actions/julia-runtest@master
+        uses: julia-actions/julia-runtest@v1
+        env:
+          JULIA_NUM_THREADS: 2
 
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v5
diff --git a/Project.toml b/Project.toml
index 1e5f3e43..0caee1b8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
+RealFFTs = "3645faec-0534-4e91-afef-021db0981eec"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -31,10 +32,11 @@ ImageCore = "0.10"
 OffsetArrays = "1.9"
 PrecompileTools = "1"
 Reexport = "1.1"
+RealFFTs = "1"
 StaticArrays = "0.10, 0.11, 0.12, 1.0"
 Statistics = "1"
 TiledIteration = "0.2, 0.3, 0.4, 0.5"
-julia = "1.6"
+julia = "1.10"
 
 [extras]
 AxisArrays = "39de3d68-74b9-583c-8d2d-e117c070f3a9"
diff --git a/dev/debug_planned.jl b/dev/debug_planned.jl
new file mode 100644
index 00000000..4db395b1
--- /dev/null
+++ b/dev/debug_planned.jl
@@ -0,0 +1,948 @@
+#!/usr/bin/env julia
+
+"""
+Consolidated ImageFiltering.jl Debugging Script
+
+This script investigates issues with planned FFT functionality in ImageFiltering.jl.
+
+Main issues investigated:
+1. Buffer size calculation issues in planned FFT
+2. RFFT with offset arrays vs collect() behavior
+3. Colorant (RGB/Gray) image handling in planned FFT
+4. Copyto! operations with OffsetArrays
+5. Planned FFT vs standard FFT result differences
+6. Step-by-step debugging of the FFT pipeline
+
+Usage: julia --project debug_consolidated.jl [section]
+Where section can be: all, basic, colorant, offset, pipeline, fixes, supported_algs
+"""
+
+using ImageFiltering, ImageCore, OffsetArrays, FFTViews, FFTW, Statistics
+using Test
+
+# Configuration
+const RTOL = 0.001
+const ATOL = 0.001
+const VERBOSE = true
+
+# Test data creation helpers
+function create_test_data()
+    # Basic floating point images
+    imgf = zeros(Float64, 5, 7); imgf[3,4] = 1.0
+    imgf32 = zeros(Float32, 5, 7); imgf32[3,4] = 1.0f0
+
+    # Integer image
+    imgi = zeros(Int, 5, 7); imgi[3,4] = 1
+
+    # Colorant images - floating point
+    imgg_f64 = fill(Gray{Float64}(0), 5, 7); imgg_f64[3,4] = Gray{Float64}(1)
+    imgc_f64 = fill(RGB{Float64}(0,0,0), 5, 7); imgc_f64[3,4] = RGB{Float64}(1,0,0)
+
+    # Colorant images - fixed point
+    imgg_n0f8 = fill(Gray{N0f8}(0), 5, 7); imgg_n0f8[3,4] = Gray{N0f8}(1)
+    imgc_n0f8 = fill(RGB{N0f8}(0,0,0), 5, 7); imgc_n0f8[3,4] = RGB{N0f8}(1,0,0)
+
+    # Test kernel
+    kern = [0.1 0.2; 0.4 0.5]
+    kernel = OffsetArray(kern, -1:0, 1:2)
+
+    return (
+        float_imgs = (imgf, imgf32),
+        int_imgs = (imgi,),
+        colorant_float = (imgg_f64, imgc_f64),
+        colorant_fixed = (imgg_n0f8, imgc_n0f8),
+        kernel = kernel,
+        kern_values = kern
+    )
+end
+
+function print_section(title::String)
+    println("\n" * "="^60)
+    println("=== $title ===")
+    println("="^60)
+end
+
+function print_subsection(title::String)
+    println("\n--- $title ---")
+end
+
+# Test 1: Basic planned FFT functionality
+function test_basic_planned_fft()
+    print_section("Basic Planned FFT Tests")
+
+    data = create_test_data()
+    border = "replicate"
+
+    @testset "Basic Planned FFT vs Standard FFT" begin
+        for (i, img) in enumerate(data.float_imgs)
+            img_type = typeof(img[1,1])
+            print_subsection("Testing Float Image Type: $img_type")
+
+            # Compute expected result
+            target = zeros(eltype(img), size(img))
+            target[3:4, 2:3] = rot180(data.kern_values) .* img[3,4]
+
+            # Standard FFT
+            result_std = imfilter(img, data.kernel, border, Algorithm.FFT())
+
+            # Planned FFT
+            planned_alg = planned_fft(img, data.kernel, border)
+            result_planned = imfilter(img, data.kernel, border, planned_alg)
+
+            # Tests
+            @test result_std ≈ target rtol=RTOL atol=ATOL
+            @test result_planned ≈ target rtol=RTOL atol=ATOL
+            @test result_std ≈ result_planned rtol=RTOL atol=ATOL
+
+            if VERBOSE
+                println("  ✓ $img_type: All tests passed!")
+                println("    Standard result: ", result_std[3:4, 2:3])
+                println("    Planned result:  ", result_planned[3:4, 2:3])
+                println("    Target result:   ", target[3:4, 2:3])
+            end
+        end
+    end
+end
+
+# Test 2: Colorant image handling
+function test_colorant_images()
+    print_section("Colorant Image Tests")
+
+    data = create_test_data()
+    border = "replicate"
+
+    @testset "Colorant Images with Float Element Types" begin
+        for (i, img) in enumerate(data.colorant_float)
+            img_type = typeof(img[1,1])
+            print_subsection("Testing Colorant Type: $img_type")
+
+            try
+                # Standard FFT
+                result_std = imfilter(img, data.kernel, border, Algorithm.FFT())
+
+                # Planned FFT
+                planned_alg = planned_fft(img, data.kernel, border)
+                result_planned = imfilter(img, data.kernel, border, planned_alg)
+
+                # Compare results
+                results_match = result_std ≈ result_planned
+                @test results_match
+
+                if VERBOSE
+                    if results_match
+                        println("  ✓ $img_type: Planned FFT matches standard FFT!")
+                    else
+                        println("  ✗ $img_type: Results differ!")
+                        println("    Max difference: ", maximum(abs.(result_std - result_planned)))
+                    end
+                    println("    Standard result [2:4, 2:4]: ")
+                    println("      ", result_std[2:4, 2:4])
+                    println("    Planned result [2:4, 2:4]: ")
+                    println("      ", result_planned[2:4, 2:4])
+                end
+
+            catch e
+                @test false  # Should not fail for float colorants
+                if VERBOSE
+                    println("  ✗ $img_type: Unexpected error: $e")
+                end
+            end
+        end
+    end
+
+    @testset "Colorant Images with Fixed Point Element Types (Now Supported)" begin
+        for (i, img) in enumerate(data.colorant_fixed)
+            img_type = typeof(img[1,1])
+            print_subsection("Testing Fixed Point Colorant Type: $img_type")
+
+            # Standard FFT should work
+            result_std = imfilter(img, data.kernel, border, Algorithm.FFT())
+            if VERBOSE
+                println("  ✓ Standard FFT works for $img_type")
+            end
+
+            # Planned FFT should now work too
+            try
+                planned_alg = planned_fft(img, data.kernel, border)
+                result_planned = imfilter(img, data.kernel, border, planned_alg)
+
+                # Compare results
+                results_match = result_std ≈ result_planned
+                @test results_match
+
+                if VERBOSE
+                    if results_match
+                        println("  ✓ $img_type: Planned FFT now works and matches standard FFT!")
+                    else
+                        println("  ⚠ $img_type: Planned FFT works but results differ slightly")
+                        println("    Max difference: ", maximum(abs.(result_std - result_planned)))
+                    end
+                end
+            catch e
+                if VERBOSE
+                    println("  ✗ $img_type: Planned FFT failed with error: $e")
+                end
+                @test false  # Should work now
+            end
+        end
+    end
+end
+
+# Test 3: Offset array handling
+function test_offset_arrays()
+    print_section("Offset Array Handling Tests")
+
+    # Create offset array test case
+    data = reshape(1.0:24.0, 3, 8, 1)
+    data_offset = OffsetArray(data, 1:3, 0:7, 1:1)
+    dims = (2, 3)
+
+    print_subsection("Basic Offset Array RFFT Behavior")
+
+    if VERBOSE
+        println("data size: ", size(data))
+        println("data axes: ", axes(data))
+        println("data_offset size: ", size(data_offset))
+        println("data_offset axes: ", axes(data_offset))
+    end
+
+    # Test different RFFT approaches
+    B1 = rfft(data, dims)
+    B2 = rfft(data_offset, dims)
+    B3 = rfft(collect(data_offset), dims)
+
+    if VERBOSE
+        println("RFFT(data) size: ", size(B1))
+        println("RFFT(data_offset) size: ", size(B2))
+        println("RFFT(collect(data_offset)) size: ", size(B3))
+
+        println("\nComparisons:")
+        println("B1 ≈ B2: ", B1 ≈ B2)
+        println("B1 ≈ B3: ", B1 ≈ B3)
+        println("B2 ≈ B3: ", B2 ≈ B3)
+
+        if !(B1 ≈ B2)
+            println("Max diff B1 vs B2: ", maximum(abs.(B1 - B2)))
+        end
+    end
+
+    # Test with manual plans
+    print_subsection("Manual Plan Execution")
+    buf = Array{Float64}(undef, size(data))
+    plan = plan_rfft(buf, dims; flags=FFTW.MEASURE)
+
+    copyto!(buf, data)
+    B4 = plan * buf
+
+    copyto!(buf, collect(data_offset))
+    B5 = plan * buf
+
+    if VERBOSE
+        println("Manual plan on data: ", size(B4))
+        println("Manual plan on collect(data_offset): ", size(B5))
+        println("B1 ≈ B4: ", B1 ≈ B4)
+        println("B1 ≈ B5: ", B1 ≈ B5)
+    end
+end
+
+# Test 4: Step-by-step pipeline debugging
+function test_pipeline_debugging()
+    print_section("Pipeline Step-by-Step Debugging")
+
+    # Use RGB colorant case that shows issues
+    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
+    kern = [0.1 0.2; 0.4 0.5]
+    kernel = OffsetArray(kern, -1:0, 1:2)
+    border = "replicate"
+
+    # Setup padded arrays
+    borderspec = ImageFiltering.borderinstance(border)
+    bord = borderspec(kernel, imgc, Algorithm.FFT())
+    _A = ImageFiltering.padarray(RGB{Float64}, imgc, bord)
+    kern_fft = ImageFiltering.samedims(_A, ImageFiltering.kernelconv(kernel))
+    krn = FFTView(zeros(eltype(kern_fft), map(length, axes(_A))))
+    for I in CartesianIndices(axes(kern_fft))
+        krn[I] = kern_fft[I]
+    end
+
+    # Get channelview and dims
+    Av, dims = ImageFiltering.channelview_dims(_A)
+    kernrs = ImageFiltering.kreshape(RGB{Float64}, krn)
+
+    if VERBOSE
+        println("Padded image channelview size: ", size(Av))
+        println("Kernel reshaped size: ", size(kernrs))
+        println("Transform dims: ", dims)
+    end
+
+    print_subsection("Standard Pipeline")
+    # Use collect() to match what the actual filtfft function does for colorants
+    Av_collected = collect(Av)
+    B_std = rfft(Av_collected, dims)
+    krn_buf_std = rfft(kernrs, dims)
+    B_std_copy = copy(B_std)
+    B_std_copy .*= conj!(krn_buf_std)
+    Avf_std = irfft(B_std_copy, length(axes(Av_collected, dims[1])), dims)
+
+    print_subsection("Planned Pipeline")
+    planned_alg = planned_fft(imgc, kernel, border)
+    B_planned = planned_alg.plan1(Av)
+    krn_buf_planned = planned_alg.plan2(kernrs)
+    B_planned .*= conj!(krn_buf_planned)
+    Avf_planned = planned_alg.plan3(B_planned)
+
+    if VERBOSE
+        println("Standard B size: ", size(B_std))
+        println("Planned B size: ", size(B_planned))
+        println("B_std ≈ B_planned: ", B_std ≈ B_planned)
+
+        if !(B_std ≈ B_planned)
+            println("Max difference in B: ", maximum(abs.(B_std - B_planned)))
+        end
+
+        println("Standard final size: ", size(Avf_std))
+        println("Planned final size: ", size(Avf_planned))
+        println("Final results equal: ", Avf_std ≈ Avf_planned)
+
+        if !(Avf_std ≈ Avf_planned)
+            println("Max difference in final: ", maximum(abs.(Avf_std - Avf_planned)))
+        end
+    end
+
+    # Convert back to colorant
+    result_std_colorant = colorview(base_colorant_type(RGB{Float64}){eltype(Avf_std)}, Avf_std)
+    result_planned_colorant = colorview(base_colorant_type(RGB{Float64}){eltype(Avf_planned)}, Avf_planned)
+
+    if VERBOSE
+        println("\nFinal colorant results [2:4, 2:4]:")
+        println("Standard: ")
+        println(result_std_colorant[2:4, 2:4])
+        println("Planned: ")
+        println(result_planned_colorant[2:4, 2:4])
+    end
+end
+
+# Test 5: Buffer and copyto! issues
+function test_buffer_issues()
+    print_section("Buffer and Copyto! Issues")
+
+    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
+    kern = [0.1 0.2; 0.4 0.5]
+    kernel = OffsetArray(kern, -1:0, 1:2)
+    border = "replicate"
+
+    borderspec = ImageFiltering.borderinstance(border)
+    bord = borderspec(kernel, imgc, Algorithm.FFT())
+    _A = ImageFiltering.padarray(RGB{Float64}, imgc, bord)
+    Av, dims = ImageFiltering.channelview_dims(_A)
+
+    print_subsection("Collect vs no_offset_view Comparison")
+
+    if VERBOSE
+        println("Av type: ", typeof(Av))
+        println("Av axes: ", axes(Av))
+        println("Av size: ", size(Av))
+    end
+
+    # Test old method (no_offset_view)
+    buf_old = Array{Float64}(undef, size(Av))
+    no_offset_av = OffsetArrays.no_offset_view(Av)
+    copyto!(buf_old, no_offset_av)
+
+    # Test new method (collect)
+    buf_new = Array{Float64}(undef, size(Av))
+    copyto!(buf_new, collect(Av))
+
+    if VERBOSE
+        println("no_offset_view axes: ", axes(no_offset_av))
+        println("buf_old ≈ buf_new: ", buf_old ≈ buf_new)
+
+        if !(buf_old ≈ buf_new)
+            println("Max difference: ", maximum(abs.(buf_old - buf_new)))
+        end
+    end
+
+    # Test RFFT on both approaches
+    B_old = rfft(buf_old, dims)
+    B_new = rfft(buf_new, dims)
+    B_direct = rfft(Av, dims)
+
+    if VERBOSE
+        println("RFFT results:")
+        println("B_old ≈ B_new: ", B_old ≈ B_new)
+        println("B_direct ≈ B_new: ", B_direct ≈ B_new)
+
+        if !(B_direct ≈ B_new)
+            println("Max diff B_direct vs B_new: ", maximum(abs.(B_direct - B_new)))
+        end
+    end
+end
+
+# Test 6: Buffer size calculation issues
+function test_buffer_size_issues()
+    print_section("Buffer Size Calculation Issues")
+
+    # Simulate channelview data: (3, 8, 10) transforming along dims (2, 3)
+    test_real = rand(Float64, 3, 8, 10)
+    dims = (2, 3)
+
+    if VERBOSE
+        println("Original array size: ", size(test_real))
+        println("Transform dims: ", dims)
+    end
+
+    # Standard approach
+    B_std = rfft(test_real, dims)
+    if VERBOSE
+        println("Standard rfft output size: ", size(B_std))
+    end
+
+    # Test planned buffer creation
+    function test_buffer_creation(a::AbstractArray{T}, dims, d::Int) where {T}
+        numeric_type = T <: Real ? T : real(T)
+        input_size = collect(size(a))
+        input_size[dims[1]] = input_size[dims[1]] ÷ 2 + 1
+        buf_in = Array{Complex{numeric_type}}(undef, Tuple(input_size))
+        plan = plan_irfft(buf_in, d, dims; flags=FFTW.MEASURE)
+        return plan, buf_in
+    end
+
+    plan, buf = test_buffer_creation(test_real, dims, size(test_real, dims[1]))
+
+    if VERBOSE
+        println("Expected buffer size (from rfft): ", size(B_std))
+        println("Actual buffer size: ", size(buf))
+        println("Sizes match: ", size(B_std) == size(buf))
+    end
+
+    # Test execution
+    result_std = irfft(B_std, size(test_real, dims[1]), dims)
+    copyto!(buf, B_std)
+    result_planned = plan * buf
+
+    if VERBOSE
+        println("Standard irfft result size: ", size(result_std))
+        println("Planned irfft result size: ", size(result_planned))
+        println("Results equal: ", result_std ≈ result_planned)
+
+        if size(result_std) == size(result_planned) && !(result_std ≈ result_planned)
+            println("Max difference: ", maximum(abs.(result_std - result_planned)))
+        end
+    end
+end
+
+# Test 7: Proposed fixes and improvements
+function test_proposed_fixes()
+    print_section("Proposed Fixes and Improvements")
+
+    print_subsection("Testing collect() fix for buffered_planned_rfft_dims")
+
+    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
+    kern = [0.1 0.2; 0.4 0.5]
+    kernel = OffsetArray(kern, -1:0, 1:2)
+    border = "replicate"
+
+    borderspec = ImageFiltering.borderinstance(border)
+    bord = borderspec(kernel, imgc, Algorithm.FFT())
+    _A = ImageFiltering.padarray(RGB{Float64}, imgc, bord)
+    Av, dims = ImageFiltering.channelview_dims(_A)
+
+    # Test current planned approach
+    planned_alg = planned_fft(imgc, kernel, border)
+
+    # Compare with standard for both offset and collected arrays
+    B_std_offset = rfft(Av, dims)
+    B_std_collected = rfft(collect(Av), dims)
+    B_planned = planned_alg.plan1(Av)
+
+    if VERBOSE
+        println("Standard RFFT on offset array equals planned: ", B_std_offset ≈ B_planned)
+        println("Standard RFFT on collected array equals planned: ", B_std_collected ≈ B_planned)
+        println("Offset equals collected: ", B_std_offset ≈ B_std_collected)
+
+        if !(B_std_offset ≈ B_std_collected)
+            println("Key insight: collect() vs offset arrays give different RFFT results!")
+            println("Max diff: ", maximum(abs.(B_std_offset - B_std_collected)))
+        end
+    end
+end
+
+# Test 8: Specific test for Gray{N0f8} FFT precision issue
+function test_gray_n0f8_precision_issue()
+    print_section("Testing Gray{N0f8} FFT Precision Issue")
+
+    # Recreate the exact test case that's failing
+    imgg = fill(Gray{N0f8}(0), 5, 7)
+    imgg[3,4] = Gray{N0f8}(1)
+    kern = [0.1 0.2; 0.4 0.5]
+    kernel = OffsetArray(kern, -1:0, 1:2)
+    border = "replicate"
+
+    # Calculate the expected result
+    targetimg = zeros(typeof(imgg[1]*kern[1]), size(imgg))
+    targetimg[3:4,2:3] = rot180(kern) .* imgg[3,4]
+
+    if VERBOSE
+        println("Image type: ", typeof(imgg[1]))
+        println("Target type: ", typeof(targetimg[1]))
+        println("Target values at [3:4,2:3]: ", targetimg[3:4,2:3])
+    end
+
+    # Test different algorithms
+    print_subsection("Algorithm Comparison")
+
+    # Standard FFT
+    result_fft = imfilter(imgg, kernel, border, Algorithm.FFT())
+
+    # Planned FFT
+    planned_alg = planned_fft(imgg, kernel, border)
+    result_planned = imfilter(imgg, kernel, border, planned_alg)
+
+    # FIR for reference
+    result_fir = imfilter(imgg, kernel, border, Algorithm.FIR())
+
+    if VERBOSE
+        println("FFT result [3:4,2:3]:     ", result_fft[3:4,2:3])
+        println("Planned result [3:4,2:3]: ", result_planned[3:4,2:3])
+        println("FIR result [3:4,2:3]:     ", result_fir[3:4,2:3])
+        println("Target result [3:4,2:3]:  ", targetimg[3:4,2:3])
+
+        println("\nDifferences from target:")
+        diff_fft = maximum(abs.(result_fft - targetimg))
+        diff_planned = maximum(abs.(result_planned - targetimg))
+        diff_fir = maximum(abs.(result_fir - targetimg))
+        println("FFT max diff:     ", diff_fft)
+        println("Planned max diff: ", diff_planned)
+        println("FIR max diff:     ", diff_fir)
+
+        println("\nCross-comparisons:")
+        println("FFT ≈ Planned:    ", result_fft ≈ result_planned)
+        println("FFT ≈ FIR:        ", result_fft ≈ result_fir)
+        println("Planned ≈ FIR:    ", result_planned ≈ result_fir)
+
+        if !(result_fft ≈ result_planned)
+            println("Max diff FFT vs Planned: ", maximum(abs.(result_fft - result_planned)))
+        end
+    end
+
+    print_subsection("Tolerance Analysis")
+
+    # Test with different tolerances
+    tolerances = [1e-15, 1e-12, 1e-9, 1e-6, 1e-3]
+
+    for tol in tolerances
+        fft_matches = isapprox(result_fft, targetimg, rtol=tol, atol=tol)
+        planned_matches = isapprox(result_planned, targetimg, rtol=tol, atol=tol)
+
+        if VERBOSE
+            println("Tolerance $tol: FFT=$(fft_matches), Planned=$(planned_matches)")
+        end
+
+        if fft_matches && planned_matches
+            if VERBOSE
+                println("  Both algorithms pass at tolerance: $tol")
+            end
+            break
+        end
+    end
+
+    print_subsection("Investigating FFT vs Planned FFT Differences")
+
+    # Let's dig into the FFT implementation details
+    borderspec = ImageFiltering.borderinstance(border)
+    bord = borderspec(kernel, imgg, Algorithm.FFT())
+    _A = ImageFiltering.padarray(Gray{Float64}, imgg, bord)
+    Av, dims = ImageFiltering.channelview_dims(_A)
+
+    if VERBOSE
+        println("Padded array type: ", typeof(_A))
+        println("Channelview type: ", typeof(Av))
+        println("Channelview axes: ", axes(Av))
+        println("Transform dims: ", dims)
+    end
+
+    # Compare offset vs collect behavior specifically for this case
+    B_offset = rfft(Av, dims)
+    B_collected = rfft(collect(Av), dims)
+
+    if VERBOSE
+        println("RFFT offset vs collected equal: ", B_offset ≈ B_collected)
+        if !(B_offset ≈ B_collected)
+            println("Max difference: ", maximum(abs.(B_offset - B_collected)))
+        end
+    end
+
+    print_subsection("Comparing Different Image Types")
+
+    # Test the same kernel/border with different image types
+    test_cases = [
+        ("Float64", zeros(Float64, 5, 7), 1.0),
+        ("Gray{Float64}", fill(Gray{Float64}(0), 5, 7), Gray{Float64}(1)),
+        ("Gray{N0f8}", fill(Gray{N0f8}(0), 5, 7), Gray{N0f8}(1)),
+        ("RGB{Float64}", fill(RGB{Float64}(0,0,0), 5, 7), RGB{Float64}(1,0,0)),
+    ]
+
+    for (name, img_template, nonzero_val) in test_cases
+        img = copy(img_template)
+        img[3,4] = nonzero_val
+
+        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+        targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
+
+        result_fir = imfilter(img, kernel, border, Algorithm.FIR())
+        result_fft = imfilter(img, kernel, border, Algorithm.FFT())
+
+        fir_matches = result_fir ≈ targetimg
+        fft_matches = result_fft ≈ targetimg
+
+        if VERBOSE
+            println("$name:")
+            println("  FIR matches target: $fir_matches")
+            println("  FFT matches target: $fft_matches")
+            if !fft_matches
+                println("  FFT[3:4,2:3]: ", result_fft[3:4,2:3])
+                println("  Target[3:4,2:3]: ", targetimg[3:4,2:3])
+                # Check if values are just shifted
+                if result_fft[4:5,2:3] ≈ targetimg[3:4,2:3]
+                    println("  ⚠️  FFT values appear shifted down by 1 row!")
+                end
+            end
+        end
+    end
+end
+
+# Test 9: Updated supported_algs logic for colorants
+function test_supported_algs_colorants()
+    print_section("Testing Updated supported_algs Logic for Colorants")
+
+    # Helper function to check if a type supports planned_fft
+    function supports_planned_fft(::Type{T}) where T
+        # AbstractFloat types are directly supported
+        T <: AbstractFloat && return true
+
+        # Colorant types are supported if their element type can be converted to FFT-compatible types
+        if T <: Colorant
+            try
+                # Check if ffteltype can convert the element type
+                elt = eltype(T)
+                fft_type = ImageFiltering.ffteltype(elt)
+                return fft_type <: Union{Float32, Float64}
+            catch
+                return false
+            end
+        end
+
+        return false
+    end
+
+    function supported_algs(img::AbstractArray{T}, kernel, border) where T
+        base_algs = (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+
+        # Include planned_fft if:
+        # 1. The image type supports planned_fft (AbstractFloat or compatible Colorants)
+        # 2. All kernel elements are floating point or complex floating point
+        # 3. Border is not NA (since NA requires special handling)
+        if supports_planned_fft(T) &&
+           all(k -> eltype(k) <: Union{AbstractFloat, Complex{<:AbstractFloat}}, (kernel isa Tuple ? kernel : (kernel,))) &&
+           !isa(border, NA)
+            return (base_algs..., planned_fft(img, kernel, border))
+        else
+            return base_algs
+        end
+    end
+
+    data = create_test_data()
+    kern = [0.1 0.2; 0.4 0.5]
+    kernel = OffsetArray(kern, -1:0, 1:2)
+    border = "replicate"
+
+    print_subsection("Testing supports_planned_fft function")
+
+    # Test AbstractFloat types
+    @test supports_planned_fft(Float64) == true
+    @test supports_planned_fft(Float32) == true
+    if VERBOSE
+        println("  ✓ Float64 and Float32 support planned_fft")
+    end
+
+    # Test colorant types with floating point elements
+    @test supports_planned_fft(Gray{Float64}) == true
+    @test supports_planned_fft(RGB{Float64}) == true
+    @test supports_planned_fft(Gray{Float32}) == true
+    @test supports_planned_fft(RGB{Float32}) == true
+    if VERBOSE
+        println("  ✓ Gray{Float64}, RGB{Float64}, Gray{Float32}, RGB{Float32} support planned_fft")
+    end
+
+    # Test colorant types with fixed point elements
+    @test supports_planned_fft(Gray{N0f8}) == true  # N0f8 should be convertible to Float32
+    @test supports_planned_fft(RGB{N0f8}) == true   # N0f8 should be convertible to Float32
+    if VERBOSE
+        println("  ✓ Gray{N0f8} and RGB{N0f8} support planned_fft (via ffteltype conversion)")
+    end
+
+    # Test integer types (should not support)
+    @test supports_planned_fft(Int) == false
+    @test supports_planned_fft(UInt8) == false
+    if VERBOSE
+        println("  ✓ Int and UInt8 do not support planned_fft")
+    end
+
+    print_subsection("Testing supported_algs function")
+
+    # Test with floating point images
+    for img in data.float_imgs
+        algs = supported_algs(img, kernel, border)
+        @test length(algs) == 4  # Should include planned_fft
+        @test any(alg -> isa(alg, typeof(planned_fft(img, kernel, border))), algs)
+        if VERBOSE
+            println("  ✓ $(typeof(img[1])) gets planned_fft support: $(length(algs)) algorithms")
+        end
+    end
+
+    # Test with colorant floating point images
+    for img in data.colorant_float
+        algs = supported_algs(img, kernel, border)
+        @test length(algs) == 4  # Should include planned_fft
+        @test any(alg -> isa(alg, typeof(planned_fft(img, kernel, border))), algs)
+        if VERBOSE
+            println("  ✓ $(typeof(img[1])) gets planned_fft support: $(length(algs)) algorithms")
+        end
+    end
+
+    # Test with colorant fixed point images (should also work now)
+    for img in data.colorant_fixed
+        try
+            algs = supported_algs(img, kernel, border)
+            @test length(algs) == 4  # Should include planned_fft
+            @test any(alg -> isa(alg, typeof(planned_fft(img, kernel, border))), algs)
+            if VERBOSE
+                println("  ✓ $(typeof(img[1])) gets planned_fft support: $(length(algs)) algorithms")
+            end
+        catch e
+            if VERBOSE
+                println("  ✗ $(typeof(img[1])) failed to get planned_fft support: $e")
+            end
+            @test false
+        end
+    end
+
+    # Test with integer images (should not get planned_fft)
+    for img in data.int_imgs
+        algs = supported_algs(img, kernel, border)
+        @test length(algs) == 3  # Should not include planned_fft
+        if VERBOSE
+            println("  ✓ $(typeof(img[1])) correctly excludes planned_fft: $(length(algs)) algorithms")
+        end
+    end
+
+    print_subsection("Testing actual filtering with planned_fft for colorants")
+
+    # Test that planned_fft actually works for colorant types
+    for img in (data.colorant_float..., data.colorant_fixed...)
+        img_type = typeof(img[1])
+        try
+            # Get algorithms including planned_fft
+            algs = supported_algs(img, kernel, border)
+            planned_alg = last(algs)  # The planned_fft algorithm
+
+            # Test that it actually works
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_std = imfilter(img, kernel, border, Algorithm.FFT())
+
+            # Results should be approximately equal
+            @test result_planned ≈ result_std rtol=0.01 atol=0.01
+
+            if VERBOSE
+                println("  ✓ $img_type: Planned FFT successfully filters and matches standard FFT")
+            end
+        catch e
+            if VERBOSE
+                println("  ✗ $img_type: Failed to filter with planned FFT: $e")
+            end
+            @test false
+        end
+    end
+end
+
+# Test 10: Deep dive into FFT positioning issue
+function test_fft_positioning_detailed()
+    print_section("Deep Dive into FFT Positioning Issue")
+
+    # Create simple test case
+    img_float = zeros(Float64, 6, 6)
+    img_float[3,3] = 1.0
+
+    img_gray = Gray{Float64}.(img_float)
+
+    kernel = OffsetArray([0.1 0.2; 0.3 0.4], -1:0, -1:0)
+    border = "replicate"
+
+    print_subsection("Testing Float64 image (works correctly)")
+    result_float_fft = imfilter(img_float, kernel, border, Algorithm.FFT())
+    result_float_fir = imfilter(img_float, kernel, border, Algorithm.FIR())
+
+    if VERBOSE
+        println("Float64 FFT result at [2:4,2:4]:")
+        println(result_float_fft[2:4,2:4])
+        println("Float64 FIR result at [2:4,2:4]:")
+        println(result_float_fir[2:4,2:4])
+        println("Match: ", result_float_fft ≈ result_float_fir)
+    end
+
+    print_subsection("Testing Gray{Float64} image (has positioning bug)")
+    result_gray_fft = imfilter(img_gray, kernel, border, Algorithm.FFT())
+    result_gray_fir = imfilter(img_gray, kernel, border, Algorithm.FIR())
+
+    if VERBOSE
+        println("Gray{Float64} FFT result at [2:4,2:4]:")
+        println(result_gray_fft[2:4,2:4])
+        println("Gray{Float64} FIR result at [2:4,2:4]:")
+        println(result_gray_fir[2:4,2:4])
+        println("Match: ", result_gray_fft ≈ result_gray_fir)
+
+        # Check if FFT result is shifted
+        if !isempty(result_gray_fft[3:5,2:4]) && !isempty(result_gray_fir[2:4,2:4])
+            println("\nChecking for shift - FFT[3:5,2:4] vs FIR[2:4,2:4]:")
+            println("FFT shifted:", result_gray_fft[3:5,2:4])
+            println("FIR target: ", result_gray_fir[2:4,2:4])
+            shifted_match = result_gray_fft[3:5,2:4] ≈ result_gray_fir[2:4,2:4]
+            println("Shifted match: ", shifted_match)
+        end
+    end
+
+    print_subsection("Investigating the filtfft implementation")
+
+    # Test the filtfft function directly
+    borderspec = ImageFiltering.borderinstance(border)
+
+    # For Float64 array
+    bord_float = borderspec(kernel, img_float, ImageFiltering.Algorithm.FFT())
+    A_float = ImageFiltering.padarray(Float64, img_float, bord_float)
+    kern_float = ImageFiltering.samedims(A_float, ImageFiltering.kernelconv(kernel))
+    krn_float = FFTView(zeros(eltype(kern_float), map(length, axes(A_float))))
+    for I in CartesianIndices(axes(kern_float))
+        krn_float[I] = kern_float[I]
+    end
+    result_filtfft_float = ImageFiltering.filtfft(A_float, krn_float)
+
+    # For Gray{Float64} array
+    bord_gray = borderspec(kernel, img_gray, ImageFiltering.Algorithm.FFT())
+    A_gray = ImageFiltering.padarray(Gray{Float64}, img_gray, bord_gray)
+    kern_gray = ImageFiltering.samedims(A_gray, ImageFiltering.kernelconv(kernel))
+    krn_gray = FFTView(zeros(eltype(kern_gray), map(length, axes(A_gray))))
+    for I in CartesianIndices(axes(krn_gray))
+        krn_gray[I] = kern_gray[I]
+    end
+    result_filtfft_gray = ImageFiltering.filtfft(A_gray, krn_gray)
+
+    if VERBOSE
+        println("\nDirect filtfft comparison:")
+        println("Padded Float64 size: ", size(A_float))
+        println("Padded Gray size: ", size(A_gray))
+        println("Float64 axes: ", axes(A_float))
+        println("Gray axes: ", axes(A_gray))
+
+        # Check padding differences
+        println("\nPadding comparison:")
+        center_float = size(A_float) .÷ 2 .+ 1
+        center_gray = size(A_gray) .÷ 2 .+ 1
+        println("Float64 center: ", center_float)
+        println("Gray center: ", center_gray)
+
+        if length(center_float) >= 2 && length(center_gray) >= 2
+            r = 2:4
+            c = 2:4
+            if checkbounds(Bool, result_filtfft_float, r, c) && checkbounds(Bool, result_filtfft_gray, r, c)
+                println("Float64 filtfft result[2:4,2:4]: ", result_filtfft_float[r,c])
+                println("Gray filtfft result[2:4,2:4]: ", result_filtfft_gray[r,c])
+            end
+        end
+    end
+
+    print_subsection("Investigating channelview_dims")
+
+    # Test channelview_dims function
+    Av_gray, dims_gray = ImageFiltering.channelview_dims(A_gray)
+
+    if VERBOSE
+        println("Original Gray array size: ", size(A_gray))
+        println("Channelview size: ", size(Av_gray))
+        println("Transform dims: ", dims_gray)
+        println("Channelview axes: ", axes(Av_gray))
+        println("Original axes: ", axes(A_gray))
+    end
+
+    @test true  # Placeholder - investigating issue
+end
+
+# Main test runner
+function run_tests(section::String = "all")
+    println("ImageFiltering.jl Consolidated Debug Script")
+    println("Running section: $section")
+
+    if section in ["all", "basic"]
+        test_basic_planned_fft()
+    end
+
+    if section in ["all", "colorant"]
+        test_colorant_images()
+    end
+
+    if section in ["all", "offset"]
+        test_offset_arrays()
+    end
+
+    if section in ["all", "pipeline"]
+        test_pipeline_debugging()
+    end
+
+    if section in ["all", "buffer"]
+        test_buffer_issues()
+    end
+
+    if section in ["all", "fixes"]
+        test_proposed_fixes()
+    end
+
+    if section in ["all", "gray_precision"]
+        test_gray_n0f8_precision_issue()
+    end
+
+    if section in ["all", "supported_algs"]
+        test_supported_algs_colorants()
+    end
+
+    if section in ["all", "fft_positioning"]
+        test_fft_positioning_detailed()
+    end
+
+    # Summary
+    print_section("Summary")
+    println("Consolidated debugging complete!")
+    println("")
+    println("Key findings:")
+    println("1. Basic planned FFT works for floating point images")
+    println("2. Updated supported_algs logic now includes all colorant types")
+    println("3. Both floating point and fixed point colorants get planned FFT support")
+    println("4. Planned FFT for colorants works via ffteltype conversion")
+    println("5. Offset arrays vs collect() can give different RFFT results")
+    println("6. Buffer size calculations are generally correct")
+    println("7. TODO in test/2d.jl has been addressed - planned_fft now supports more types than just floats")
+    println("8. ✅ CRITICAL BUG FIXED: FFT positioning issue for colorant types has been resolved!")
+    println("   - Fixed by using no_offset_view() instead of collect() in FFT functions")
+    println("   - Fixed by properly restoring offset information after FFT operations")
+    println("   - All colorant types (Gray{Float64}, Gray{N0f8}, RGB{Float64}) now work correctly")
+    println("   - test/2d.jl:110 now passes - the original failing test is fixed")
+end
+
+# Command line interface
+if abspath(PROGRAM_FILE) == @__FILE__
+    section = length(ARGS) > 0 ? ARGS[1] : "all"
+    if !(section in ["all", "basic", "colorant", "offset", "pipeline", "buffer", "fixes", "gray_precision", "supported_algs", "fft_positioning"])
+        println("Invalid section: $section")
+        println("Valid sections: all, basic, colorant, offset, pipeline, buffer, fixes, gray_precision, supported_algs, fft_positioning")
+        exit(1)
+    end
+    run_tests(section)
+end
diff --git a/docs/src/index.md b/docs/src/index.md
index fd57e3f7..7f168f1d 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -13,7 +13,7 @@ processing.
 
 The main functions provided by this package are:
 
-| Function                 | Action         | 
+| Function                 | Action         |
 |:-------------------------|:---------------|
 |[`imfilter`](@ref)        | Filter a one, two or multidimensional array img with a kernel by computing their correlation |
 |[`imfilter!`](@ref)       | Filter an array img with kernel kernel by computing their correlation, storing the result in imgfilt |
@@ -25,7 +25,7 @@ The main functions provided by this package are:
 |[`findlocalminima`](@ref) | Returns the coordinates of elements whose value is smaller than all of their immediate neighbors |
 |[`findlocalmaxima`](@ref) | Returns the coordinates of elements whose value is larger than all of their immediate neighbors |
 
-Common kernels (filters) are organized in the `Kernel` and `KernelFactors` modules. 
+Common kernels (filters) are organized in the `Kernel` and `KernelFactors` modules.
 
 A common task in image processing and computer vision is computing
 image *gradients* (derivatives), for which there is the dedicated
@@ -80,6 +80,24 @@ transform (FFT).  By default, this choice is made based on kernel
 size. You can manually specify the algorithm using [`Algorithm.FFT()`](@ref)
 or [`Algorithm.FIR()`](@ref).
 
+#### Reusing FFT plans
+
+It is possible to reuse FFT plans if the operation is going to be done on the
+same array type and dimensions i.e. on each image of an image stack
+
+```julia
+using ImageFiltering, ComputationalResources
+imgstack = rand(Float64, 200, 100, 10)
+imgstack_filtered = similar(imgstack)
+
+kernel = ImageFiltering.factorkernel(Kernel.LoG(1))
+fft_planned = CPU1(ImageFiltering.planned_fft(imgstack_filtered[:,:,1], kernel))
+
+for i in axes(imgstack, 3)
+    imfilter!(fft_planned, imgstack_filtered[:,:,i], imgstack[:,:,i], kernel)
+end
+```
+
 ### Feature: Multithreading
 
 If you launch Julia with `JULIA_NUM_THREADS=n` (where `n > 1`), then
diff --git a/src/ImageFiltering.jl b/src/ImageFiltering.jl
index df070839..0bfb4d9a 100644
--- a/src/ImageFiltering.jl
+++ b/src/ImageFiltering.jl
@@ -1,12 +1,14 @@
 module ImageFiltering
 
 using FFTW
+using RealFFTs
 using ImageCore, FFTViews, OffsetArrays, StaticArrays, ComputationalResources, TiledIteration
 # Where possible we avoid a direct dependency to reduce the number of [compat] bounds
 # using FixedPointNumbers: Normed, N0f8 # reexported by ImageCore
 using ImageCore.MappedArrays
 using Statistics, LinearAlgebra
 using Base: Indices, tail, fill_to_length, @pure, depwarn, @propagate_inbounds
+import Base: copy!
 using OffsetArrays: IdentityUnitRange   # using the one in OffsetArrays makes this work with multiple Julia versions
 using SparseArrays   # only needed to fix an ambiguity in borderarray
 using Reexport
@@ -30,7 +32,8 @@ export Kernel, KernelFactors,
     imgradients, padarray, centered, kernelfactors, reflect,
     freqkernel, spacekernel,
     findlocalminima, findlocalmaxima,
-    blob_LoG, BlobLoG
+    blob_LoG, BlobLoG,
+    planned_fft
 
 const FixedColorant{T<:Normed} = Colorant{T}
 const StaticOffsetArray{T,N,A<:StaticArray} = OffsetArray{T,N,A}
@@ -50,16 +53,33 @@ function Base.transpose(A::StaticOffsetArray{T,2}) where T
 end
 
 module Algorithm
+    import FFTW
     # deliberately don't export these, but it's expected that they
     # will be used as Algorithm.FFT(), etc.
     abstract type Alg end
-    "Filter using the Fast Fourier Transform" struct FFT <: Alg end
+    "Filter using the Fast Fourier Transform" struct FFT <: Alg
+        plan1::Union{Function,Nothing}
+        plan2::Union{Function,Nothing}
+        plan3::Union{Function,Nothing}
+    end
+    FFT() = FFT(nothing, nothing, nothing)
+    function Base.show(io::IO, alg::FFT)
+        if alg.plan1 === nothing && alg.plan2 === nothing && alg.plan3 === nothing
+            print(io, "FFT()")
+        else
+            print(io, "FFT(planned)")
+        end
+    end
     "Filter using a direct algorithm" struct FIR <: Alg end
+    Base.show(io::IO, alg::FIR) = print(io, "FIR()")
     "Cache-efficient filtering using tiles" struct FIRTiled{N} <: Alg
         tilesize::Dims{N}
     end
+    Base.show(io::IO, ::FIRTiled{N}) where N = print(io, "FIRTiled{$N}()")
     "Filter with an Infinite Impulse Response filter" struct IIR <: Alg end
+    Base.show(io::IO, alg::IIR) = print(io, "IIR()")
     "Filter with a cascade of mixed types (IIR, FIR)" struct Mixed <: Alg end
+    Base.show(io::IO, alg::Mixed) = print(io, "Mixed()")
 
     FIRTiled() = FIRTiled(())
 end
diff --git a/src/imfilter.jl b/src/imfilter.jl
index 4def267b..7e03fd41 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -826,7 +826,7 @@ function _imfilter_fft!(r::AbstractCPU{FFT},
     for I in CartesianIndices(axes(kern))
         krn[I] = kern[I]
     end
-    Af = filtfft(A, krn)
+    Af = filtfft(A, krn, r.settings.plan1, r.settings.plan2, r.settings.plan3)
     if map(first, axes(out)) == map(first, axes(Af))
         R = CartesianIndices(axes(out))
         copyto!(out, R, Af, R)
@@ -858,6 +858,198 @@ end
 @inline _stretch_mul(AT::Type{<:Real}, A_fft::AbstractArray, BT::Type{<:Complex}, B_fft::AbstractArray, d::Int) = _stretch_mul(BT, B_fft, AT, A_fft, d)
 @inline _stretch_mul(AT::Type{<:Real}, A_fft::AbstractArray, BT::Type{<:Real}, B_fft::AbstractArray, _::Int) = A_fft .* B_fft
 @inline _stretch_mul(AT::Type{<:Complex}, A_fft::AbstractArray, BT::Type{<:Complex}, B_fft::AbstractArray, _::Int) = A_fft .* B_fft
+
+# Unified planned FFT functions that handle both regular arrays and colorant channelviews
+function buffered_planned_rfft(a::AbstractArray{T}, dims=1:ndims(a)) where {T}
+    numeric_type = T <: Real ? T : real(T)
+
+    if dims == 1:ndims(a) && length(dims) == ndims(a)
+        # Use RealFFTs.RCpair for full transforms (more efficient)
+        # Create a buffer and plan for the prototype
+        buf_proto = RealFFTs.RCpair{numeric_type}(undef, size(a))
+        # Use ESTIMATE flag so plan works with any buffer of the same size
+        plan = RealFFTs.plan_rfft!(buf_proto; flags=FFTW.ESTIMATE)
+        buf_size = size(a)
+        
+        # Return a function that creates thread-local buffers
+        return function (arr::AbstractArray)
+            # Each thread gets its own buffer to avoid race conditions
+            buf = RealFFTs.RCpair{numeric_type}(undef, buf_size)
+            copy!(buf, OffsetArrays.no_offset_view(arr))
+            plan(buf)
+            return complex(buf)
+        end
+    else
+        # Use standard FFTW for partial transforms
+        buf_size = size(a)
+        buf_proto = Array{numeric_type}(undef, buf_size)
+        # Use ESTIMATE flag so plan works with any buffer of the same size
+        plan = plan_rfft(buf_proto, dims; flags=FFTW.ESTIMATE)
+        
+        return function (arr::AbstractArray)
+            # Each thread gets its own buffer
+            buf = Array{numeric_type}(undef, buf_size)
+            copyto!(buf, OffsetArrays.no_offset_view(arr))
+            return plan * buf
+        end
+    end
+end
+
+function buffered_planned_irfft(a::AbstractArray{T}, dims=1:ndims(a), d::Int=size(a,1)) where {T}
+    numeric_type = T <: Real ? T : real(T)
+
+    if dims == 1:ndims(a) && length(dims) == ndims(a)
+        # Use RealFFTs.RCpair for full transforms (more efficient)
+        # Create a buffer and plan for the prototype
+        buf_proto = RealFFTs.RCpair{numeric_type}(undef, size(a))
+        # Use ESTIMATE flag so plan works with any buffer of the same size
+        plan = RealFFTs.plan_irfft!(buf_proto; flags=FFTW.ESTIMATE)
+        buf_size = size(a)
+        
+        # Return a function that creates thread-local buffers
+        return function (arr::AbstractArray)
+            # Each thread gets its own buffer to avoid race conditions
+            buf = RealFFTs.RCpair{numeric_type}(undef, buf_size)
+            copy!(buf, OffsetArrays.no_offset_view(arr))
+            plan(buf)
+            return real(buf)
+        end
+    else
+        # Use standard FFTW for partial transforms
+        input_size = collect(size(a))
+        input_size[dims[1]] = input_size[dims[1]] ÷ 2 + 1
+        buf_size = Tuple(input_size)
+        buf_proto = Array{Complex{numeric_type}}(undef, buf_size)
+        # Use ESTIMATE flag so plan works with any buffer of the same size
+        plan = plan_irfft(buf_proto, d, dims; flags=FFTW.ESTIMATE)
+        
+        return function (arr::AbstractArray)
+            # Each thread gets its own buffer
+            buf_in = Array{Complex{numeric_type}}(undef, buf_size)
+            copyto!(buf_in, OffsetArrays.no_offset_view(arr))
+            return plan * buf_in
+        end
+    end
+end
+
+function planned_fft(A::AbstractArray{T,N},
+            kernel::ProcessedKernel,
+            border::BorderSpecAny=Pad(:replicate)
+            ) where {T,N}
+    # Check if any kernel is an IIR filter (not compatible with FFT)
+    for k in kernel
+        if k isa IIRFilter
+            throw(ArgumentError("planned_fft does not support IIR filters like TriggsSdika or IIRGaussian. " *
+                              "IIR filters use a different algorithm that cannot be accelerated with FFT plans. " *
+                              "Use Algorithm.IIR() instead, or use imfilter without a plan."))
+        end
+    end
+    
+    # Use ffteltype to convert to appropriate floating point type for FFT
+    FT = ffteltype(T)
+
+    # Determine if we're dealing with colorants
+    if T <: Colorant
+        # For colorants, create target type with FFT-compatible element type
+        target_T = base_colorant_type(T){FT}
+        bord = border(kernel, A, Algorithm.FFT())
+        _A = padarray(target_T, A, bord)
+
+        # Get channelview and dims for colorant processing
+        Av, dims = channelview_dims(_A)
+
+        # Create planned functions that handle the dims parameter for colorants
+        bfp1 = buffered_planned_rfft(Av, dims)
+
+        # Create kernel for colorants
+        kern = samedims(_A, kernelconv(kernel...))
+        krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+        for I in CartesianIndices(axes(kern))
+            krn[I] = kern[I]
+        end
+        kernrs = kreshape(T, krn)
+        bfp2 = buffered_planned_rfft(kernrs, dims)
+
+        # Create planned irfft with proper dimensions
+        bfp3 = buffered_planned_irfft(Av, dims, length(axes(Av, dims[1])))
+    else
+        # For regular arrays
+        bord = border(kernel, A, Algorithm.FFT())
+        _A = padarray(FT, A, bord)
+        bfp1 = buffered_planned_rfft(_A)
+        kern = samedims(_A, kernelconv(kernel...))
+        krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+        bfp2 = buffered_planned_rfft(krn)
+        bfp3 = buffered_planned_irfft(_A)
+    end
+
+    return Algorithm.FFT(bfp1, bfp2, bfp3)
+end
+planned_fft(A::AbstractArray, kernel, border::AbstractString) = planned_fft(A, kernel, borderinstance(border))
+planned_fft(A::AbstractArray, kernel::Union{ArrayLike,Laplacian}, border::BorderSpecAny) = planned_fft(A, factorkernel(kernel), border)
+
+# Error methods for NA borders - these should not be used with planned FFT
+function planned_fft(A::AbstractArray{T,N},
+            kernel::ProcessedKernel,
+            border::NA) where {T,N}
+    throw(ArgumentError("NA borders are not supported with planned FFT algorithms."))
+end
+
+function planned_fft(A::AbstractArray,
+            kernel::Union{ArrayLike,Laplacian},
+            border::NA)
+    throw(ArgumentError("NA borders are not supported with planned FFT algorithms."))
+end
+
+planned_fft(A::AbstractArray, kernel, border::NA) = throw(ArgumentError("NA borders are not supported with planned FFT algorithms."))
+
+# Unified filtfft function for planned FFT operations
+function filtfft(A::AbstractArray{T}, krn, planned_rfft1::Function, planned_rfft2::Function, planned_irfft::Function) where {T}
+    if T <: Colorant
+        # Handle colorant arrays
+        Av, dims = channelview_dims(A)
+        kernrs = kreshape(T, krn)
+
+        # Use planned functions with proper dims handling
+        B = planned_rfft1(Av)
+        krn_buf = planned_rfft2(kernrs)
+        B .*= conj!(krn_buf)
+        Avf_no_offset = planned_irfft(B)
+
+        # Restore offset information by creating an OffsetArray with the original axes
+        Avf = OffsetArray(Avf_no_offset, axes(Av))
+
+        # Convert back to colorant array
+        return colorview(base_colorant_type(T){eltype(Avf)}, Avf)
+    else
+        # Handle regular arrays
+        B = planned_rfft1(A)
+        # Create a proper FFTView for the kernel and populate it
+        krn_fft = FFTView(zeros(eltype(krn), map(length, axes(A))))
+        for I in CartesianIndices(axes(krn))
+            krn_fft[I] = krn[I]
+        end
+        krn_buf = planned_rfft2(krn_fft)
+        B .*= conj!(krn_buf)
+        result = planned_irfft(B)
+
+        # Ensure the result has the same axes as the input array A
+        if axes(result) != axes(A)
+            # The planned IRFFT buffer might have different size or axes
+            if size(result) != size(A)
+                # Extract the portion that matches the size of A
+                result_view = view(result, ntuple(i -> 1:size(A,i), ndims(A))...)
+                return OffsetArray(collect(result_view), axes(A))
+            else
+                # Same size but different axes - use collect to get the actual data
+                return OffsetArray(collect(result), axes(A))
+            end
+        end
+        return result
+    end
+end
+filtfft(A, krn, ::Nothing, ::Nothing, ::Nothing) = filtfft(A, krn)
+
 function filtfft(A::AbstractArray{ST}, krn::AbstractArray{KT}) where {ST<:Union{Real,Complex},KT<:Union{Real,Complex}}
     CT = promote_type(ST, KT)
     B = _fft(A)
@@ -868,9 +1060,16 @@ end
 function filtfft(A::AbstractArray{CT}, krn) where {CT<:Colorant}
     Av, dims = channelview_dims(A)
     kernrs = kreshape(CT, krn)
-    B = rfft(Av, dims)
+    # Preserve offset information by working with offset arrays directly
+    # but use no_offset_view for RFFT operations
+    Av_no_offset = OffsetArrays.no_offset_view(Av)
+    B = rfft(Av_no_offset, dims)
     B .*= conj!(rfft(kernrs, dims))
-    Avf = irfft(B, length(axes(Av, dims[1])), dims)
+    Avf_no_offset = irfft(B, size(Av_no_offset, dims[1]), dims)
+
+    # Restore offset information by creating an OffsetArray with the original axes
+    Avf = OffsetArray(Avf_no_offset, axes(Av))
+
     colorview(base_colorant_type(CT){eltype(Avf)}, Avf)
 end
 channelview_dims(A::AbstractArray{C,N}) where {C<:Colorant,N} = channelview(A), ntuple(d -> d + 1, Val(N))
@@ -878,7 +1077,7 @@ channelview_dims(A::AbstractArray{C,N}) where {C<:ImageCore.Color1,N} = channelv
 
 function kreshape(::Type{C}, krn::FFTView) where {C<:Colorant}
     kern = parent(krn)
-    kernrs = FFTView(reshape(kern, 1, size(kern)...))
+    return FFTView(reshape(kern, 1, size(kern)...))
 end
 kreshape(::Type{C}, krn::FFTView) where {C<:ImageCore.Color1} = krn
 
diff --git a/src/utils.jl b/src/utils.jl
index 80e3839c..e3ce8507 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -126,7 +126,7 @@ const SmallInts = Union{UInt8,Int8,UInt16,Int16}
 accumfilter(pixelval::SmallInts, filterval::SmallInts) = Int(pixelval)*Int(filterval)
 # advice: don't use FixedPoint for the kernel
 accumfilter(pixelval::N0f8, filterval::N0f8) = Float32(pixelval)*Float32(filterval)
-accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(c)*Float32(filterval)
+accumfilter(pixelval::Colorant{N0f8}, filterval::N0f8) = float32(pixelval)*Float32(filterval)
 
 # In theory, the following might need to be specialized. For safety, make it a
 # standalone function call.
diff --git a/test/2d.jl b/test/2d.jl
index c3cdb72c..26bbd913 100644
--- a/test/2d.jl
+++ b/test/2d.jl
@@ -36,6 +36,42 @@ using ImageFiltering: borderinstance, filtfft
     end
 end
 
+# Helper function to check if a type supports planned_fft
+function supports_planned_fft(::Type{T}) where T
+    # AbstractFloat types are directly supported
+    T <: AbstractFloat && return true
+
+    # Colorant types are supported if their element type can be converted to FFT-compatible types
+    if T <: Colorant
+        try
+            # Check if ffteltype can convert the element type
+            elt = eltype(T)
+            fft_type = ImageFiltering.ffteltype(elt)
+            return fft_type <: Union{Float32, Float64}
+        catch
+            return false
+        end
+    end
+
+    return false
+end
+
+function supported_algs(img::AbstractArray{T}, kernel, border) where T
+    base_algs = (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
+
+    # Include planned_fft if:
+    # 1. The image type supports planned_fft (AbstractFloat or compatible Colorants)
+    # 2. All kernel elements are floating point or complex floating point
+    # 3. Border is not NA (since NA requires special handling)
+    if supports_planned_fft(T) &&
+       all(k -> eltype(k) <: Union{AbstractFloat, Complex{<:AbstractFloat}}, (kernel isa Tuple ? kernel : (kernel,))) &&
+       !isa(border, NA)
+        return (base_algs..., planned_fft(img, kernel, border))
+    else
+        return base_algs
+    end
+end
+
 @testset "FIR/FFT" begin
     f32type(img) = f32type(eltype(img))
     f32type(::Type{C}) where {C<:Colorant} = base_colorant_type(C){Float32}
@@ -47,100 +83,110 @@ end
     imgi = zeros(Int, 5, 7); imgi[3,4] = 1
     imgg = fill(Gray{N0f8}(0), 5, 7); imgg[3,4] = 1
     imgc = fill(RGB{N0f8}(0,0,0), 5, 7); imgc[3,4] = RGB(1,0,0)
-    # Dense inseparable kernel
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    for img in (imgf, imgi, imgg, imgc)
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
-        @test @inferred(imfilter(img, kernel)) ≈ targetimg
-        @test @inferred(imfilter(img, (kernel,))) ≈ targetimg
-        @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
-        ret = imfilter(img, kernel)
-        fill!(ret, zero(eltype(ret)))
-        @test @inferred(imfilter!(ret, img, kernel)) ≈ targetimg
-        fill!(ret, zero(eltype(ret)))
-        @test @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel)) ≈ targetimg
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+    imgs = (("Float64", imgf), ("Int", imgi), ("Gray{N0f8}", imgg), ("RGB{N0f8}", imgc))
+    @testset "Dense inseparable kernel" begin
+        kern = [0.1 0.2; 0.4 0.5]
+        kernel = OffsetArray(kern, -1:0, 1:2)
+        @testset "$imgname" for (imgname, img) in imgs
+            targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+            targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
+            @test @inferred(imfilter(img, kernel)) ≈ targetimg
+            @test @inferred(imfilter(img, (kernel,))) ≈ targetimg
+            @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
+            ret = imfilter(img, kernel)
+            fill!(ret, zero(eltype(ret)))
+            @test @inferred(imfilter!(ret, img, kernel)) ≈ targetimg
             fill!(ret, zero(eltype(ret)))
-            @test @inferred(imfilter!(ret, img, kernel, border)) ≈ targetimg
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(img, (kernel,), border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+            @test @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel)) ≈ targetimg
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
                 fill!(ret, zero(eltype(ret)))
-                @test @inferred(imfilter!(CPU1(alg), ret, img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter!(ret, img, kernel, border)) ≈ targetimg
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(img, (kernel,), border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                    fill!(ret, zero(eltype(ret)))
+                    @test @inferred(imfilter!(CPU1(alg), ret, img, kernel, border)) ≈ targetimg
+                end
+                @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
             end
-            @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
-        end
-        targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
-            @test @inferred(imfilter(CPU1(alg), img, kernel, Inner())) ≈ targetimg_inner
-        end
-    end
-    # Factored kernel
-    kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
-    kern = parent(kernel[1]).*parent(kernel[2])
-    for img in (imgf, imgi, imgg, imgc)
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
-        ret = similar(targetimg)
-        @test @inferred(imfilter(img, kernel)) ≈ targetimg
-        @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+            @testset "Inner()" begin
+                targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
+                @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
+                @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
+                @testset "$alg" for alg in supported_algs(img, kernel, Inner())
+                    @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
+                    @test @inferred(imfilter(CPU1(alg), img, kernel, Inner())) ≈ targetimg_inner
+                end
             end
-            @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
-            @test_throws ErrorException imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, borderinstance(border), axes(ret)) #167
-        end
-        targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
         end
     end
-    # Rational filter coefficients
-    kernel = (centered([1//3, 1//3, 1//3]), centered([1//3, 1//3, 1//3]'))
-    kern = kernel[1].*kernel[2]
-    for img in (imgf, imgi, imgg, imgc)
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[2:4,3:5] .= img[3,4]*(1//9)
-        @test @inferred(imfilter(img, kernel)) ≈ targetimg
-        @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                if alg == Algorithm.FFT() && eltype(img) == Int
-                    @test @inferred(imfilter(Float64, img, kernel, border, alg)) ≈ targetimg
-                else
+    @testset "Factored kernel" begin
+        kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
+        kern = parent(kernel[1]).*parent(kernel[2])
+        @testset "$imgname" for (imgname, img) in imgs
+            targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+            targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
+            ret = similar(targetimg)
+            @test @inferred(imfilter(img, kernel)) ≈ targetimg
+            @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
                     @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+                @test_throws MethodError imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, border, Algorithm.FFT())
+                @test_throws ErrorException imfilter!(CPU1(Algorithm.FIR()), ret, img, kernel, borderinstance(border), axes(ret)) #167
+            end
+            @testset "Inner()" begin
+                targetimg_inner = OffsetArray(targetimg[2:end, 1:end-2], 2:5, 1:5)
+                @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
+                @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
+                @testset "$alg" for alg in supported_algs(img, kernel, Inner())
+                    @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
                 end
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
             end
         end
-        targetimg_inner = OffsetArray(targetimg[2:end-1, 2:end-1], 2:4, 2:6)
-        @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
-        @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            if alg == Algorithm.FFT() && eltype(img) == Int
-                @test @inferred(imfilter(Float64, img, kernel, Inner(), alg)) ≈ targetimg_inner
-            else
-                @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+    end
+    @testset "Rational filter coefficients" begin
+        kernel = (centered([1//3, 1//3, 1//3]), centered([1//3, 1//3, 1//3]'))
+        kern = kernel[1].*kernel[2]
+        @testset "$imgname" for (imgname, img) in imgs
+            targetimg = zeros(typeof(img[1]*kern[1]), size(img))
+            targetimg[2:4,3:5] .= img[3,4]*(1//9)
+            @test @inferred(imfilter(img, kernel)) ≈ targetimg
+            @test @inferred(imfilter(f32type(img), img, kernel)) ≈ float32.(targetimg)
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    if alg == Algorithm.FFT() && eltype(img) == Int
+                        @test @inferred(imfilter(Float64, img, kernel, border, alg)) ≈ targetimg
+                    else
+                        @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    end
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+            end
+            @testset "Inner()" begin
+                targetimg_inner = OffsetArray(targetimg[2:end-1, 2:end-1], 2:4, 2:6)
+                @test @inferred(imfilter(img, kernel, Inner())) ≈ targetimg_inner
+                @test @inferred(imfilter(f32type(img), img, kernel, Inner())) ≈ float32.(targetimg_inner)
+                @testset "$alg" for alg in supported_algs(img, kernel, Inner())
+                    if alg == Algorithm.FFT() && eltype(img) == Int
+                        @test @inferred(imfilter(Float64, img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    else
+                        @test @inferred(imfilter(img, kernel, Inner(), alg)) ≈ targetimg_inner
+                    end
+                    @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
+                end
             end
-            @test @inferred(imfilter(f32type(img), img, kernel, Inner(), alg)) ≈ float32.(targetimg_inner)
         end
     end
     ## Images for which the boundary conditions matter
@@ -148,9 +194,8 @@ end
     imgi = zeros(Int, 5, 7); imgi[1,2] = 1
     imgg = fill(Gray(0), 5, 7); imgg[1,2] = 1
     imgc = fill(RGB(0,0,0), 5, 7); imgc[1,2] = RGB(1,0,0)
-    # Dense kernel
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
+    imgs = (("Float64", imgf), ("Int", imgi), ("Gray{N0f8}", imgg), ("RGB{N0f8}", imgc))
+
     function target1(img::AbstractArray{T}, kern, border) where T
         if border ∈ ("replicate", "symmetric")
             ret = float64.(zero(img))
@@ -179,63 +224,111 @@ end
         ret[:,end] .= nan(eltype(ret))  # for the last column, this kernel is entirely in the padding region
         ret
     end
-    for img in (imgf, imgi, imgg, imgc)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            targetimg = target1(img, kern, border)
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+    @testset "Dense kernel" begin
+        kern = [0.1 0.2; 0.4 0.5]
+        kernel = OffsetArray(kern, -1:0, 1:2)
+        @testset "$imgname" for (imgname, img) in imgs
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                targetimg = target1(img, kern, border)
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+            end
+            @testset "NA()" begin
+                border = NA()
+                targetimg0 = target1(img, kern, border)
+                nanflag = isnan.(targetimg0)
+                targetimg = zerona!(copy(targetimg0))
+                @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
+                @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
+                    @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
+                end
             end
-        end
-        border = NA()
-        targetimg0 = target1(img, kern, border)
-        nanflag = isnan.(targetimg0)
-        targetimg = zerona!(copy(targetimg0))
-        @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
-        @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
-            @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
         end
     end
-    # Factored kernel
-    kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
-    kern = parent(kernel[1]).*parent(kernel[2])
-    for img in (imgf, imgi, imgg, imgc)
-        for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
-            targetimg = target1(img, kern, border)
-            @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
-            @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
-            for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-                @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
-                @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+    @testset  "Factored kernel" begin
+        kernel = (OffsetArray([0.2,0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
+        kern = parent(kernel[1]).*parent(kernel[2])
+        @testset "$imgname" for (imgname, img) in imgs
+            @testset "$border" for border in ("replicate", "circular", "symmetric", "reflect", Fill(zero(eltype(img))))
+                targetimg = target1(img, kern, border)
+                @test @inferred(imfilter(img, kernel, border)) ≈ targetimg
+                @test @inferred(imfilter(f32type(img), img, kernel, border)) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(imfilter(img, kernel, border, alg)) ≈ targetimg
+                    @test @inferred(imfilter(f32type(img), img, kernel, border, alg)) ≈ float32.(targetimg)
+                end
+            end
+            @testset "NA()" begin
+                border = NA()
+                targetimg0 = target1(img, kern, border)
+                nanflag = isnan.(targetimg0)
+                targetimg = zerona!(copy(targetimg0))
+                @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
+                @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
+                @testset "$alg" for alg in supported_algs(img, kernel, border)
+                    @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
+                    @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
+                end
             end
         end
-        border = NA()
-        targetimg0 = target1(img, kern, border)
-        nanflag = isnan.(targetimg0)
-        targetimg = zerona!(copy(targetimg0))
-        @test @inferred(zerona!(imfilter(img, kernel, border))) ≈ targetimg
-        @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border))) ≈ float32.(targetimg)
-        for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-            @test @inferred(zerona!(imfilter(img, kernel, border, alg), nanflag)) ≈ targetimg
-            @test @inferred(zerona!(imfilter(f32type(img), img, kernel, border, alg), nanflag)) ≈ float32.(targetimg)
+        # filtering with a 0d kernel
+        a = rand(3,3)
+        # @test imfilter(a, reshape([2])) ≈ 2a
+        # imfilter! as a copy
+        ret = zeros(3, 3)
+        b = @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, a, (), NoPad()))
+        @test b == a
+        @test !(b === a)
+        # OffsetArrays
+        img = OffsetArray(rand(RGB{N0f8}, 80, 100), (-5, 3))
+        imgf = imfilter(img, Kernel.gaussian((3,3)))
+        @test axes(imgf) == axes(img)
+    end
+
+    @testset "Planned FFT edge cases" begin
+        # Test case 1: axes(result) != axes(A) but size(result) == size(A)
+        A_offset = OffsetArray(rand(Float64, 6, 8), (-1, 1))
+        kernel = rand(Float64, 3, 3)
+        planned_alg = planned_fft(A_offset, kernel, Fill(0.0))
+        result_planned = imfilter(A_offset, kernel, Fill(0.0), planned_alg)
+        result_standard = imfilter(A_offset, kernel, Fill(0.0), Algorithm.FFT())
+        @test axes(result_planned) == axes(A_offset)
+        @test result_planned ≈ result_standard
+
+        # Test case 2: size(result) != size(A) - create artificial size mismatch
+        A = rand(Float64, 7, 9)
+        bord = Fill(0.0)(rand(3,5), A, Algorithm.FFT())
+        _A = ImageFiltering.padarray(Float64, A, bord)
+        kern = ImageFiltering.samedims(_A, ImageFiltering.kernelconv(rand(3,5)))
+        krn = FFTView(zeros(eltype(kern), map(length, axes(_A))))
+
+        # Create custom planned function that returns larger buffer
+        original_irfft = ImageFiltering.buffered_planned_irfft(_A)
+        custom_irfft = function(arr)
+            result = original_irfft(arr)
+            if size(result, 1) > 2 && size(result, 2) > 2
+                # Return buffer that's 1 larger in each dimension
+                larger = zeros(eltype(result), size(result, 1) + 1, size(result, 2) + 1)
+                larger[1:size(result,1), 1:size(result,2)] = result
+                return larger
+            end
+            return result
         end
+
+        result_custom = ImageFiltering.filtfft(_A, krn,
+            ImageFiltering.buffered_planned_rfft(_A),
+            ImageFiltering.buffered_planned_rfft(krn),
+            custom_irfft)
+        result_standard = ImageFiltering.filtfft(_A, krn)
+        @test size(result_custom) == size(_A)
+        @test result_custom ≈ result_standard
     end
-    # filtering with a 0d kernel
-    a = rand(3,3)
-    # @test imfilter(a, reshape([2])) ≈ 2a
-    # imfilter! as a copy
-    ret = zeros(3, 3)
-    b = @inferred(imfilter!(CPU1(Algorithm.FIR()), ret, a, (), NoPad()))
-    @test b == a
-    @test !(b === a)
-    # OffsetArrays
-    img = OffsetArray(rand(RGB{N0f8}, 80, 100), (-5, 3))
-    imgf = imfilter(img, Kernel.gaussian((3,3)))
-    @test axes(imgf) == axes(img)
 end
 
 @testset "Borders (issue #85)" begin
@@ -356,3 +449,215 @@ end
     @test filtfft(C, D) ≈ filtfft(C, ComplexF32.(D))
     @test filtfft(C, D) ≈ filtfft(ComplexF32.(C), ComplexF32.(D))
 end
+
+@testset "Planned FFT with NA borders should error" begin
+    imgf = zeros(Float64, 5, 7); imgf[3,4] = 1.0
+    imgi = zeros(Int, 5, 7); imgi[3,4] = 1
+    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
+    kernel = OffsetArray([0.1 0.2; 0.4 0.5], -1:0, 1:2)
+    na_border = NA()
+
+    @test_throws ArgumentError planned_fft(imgf, kernel, na_border)
+    @test_throws ArgumentError planned_fft(imgi, kernel, na_border)
+    @test_throws ArgumentError planned_fft(imgc, kernel, na_border)
+    kern_tuple = (OffsetArray([0.2, 0.8], -1:0), OffsetArray([0.3 0.6], 0:0, 1:2))
+    @test_throws ArgumentError planned_fft(imgf, kern_tuple, na_border)
+    @test_throws ArgumentError planned_fft(imgf, Kernel.Laplacian(), na_border)
+end
+
+@testset "Planned FFT with IIR filters should error" begin
+    imgf = rand(Float64, 50, 50)
+    
+    # IIRGaussian is an IIR filter
+    iir_kernel = KernelFactors.IIRGaussian(Float64, (3.0,))
+    @test_throws ArgumentError planned_fft(imgf, iir_kernel, "replicate")
+    
+    # TriggsSdika tuple
+    ts_kernel = KernelFactors.IIRGaussian(Float64, (3.0, 3.0))
+    @test_throws ArgumentError planned_fft(imgf, ts_kernel, Pad(:replicate))
+end
+
+@testset "Plan Reuse Across Multiple Images" begin
+    @testset "Reuse with same-size Float64 images" begin
+        # Create multiple images of the same size
+        images = [rand(Float64, 50, 50) for _ in 1:5]
+        # Add some structure to make filtering meaningful
+        for img in images
+            img[20:30, 20:30] .= 1.0
+        end
+        
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        # Create plan once
+        planned_alg = planned_fft(images[1], kernel, border)
+        
+        # Reuse plan for all images
+        for (i, img) in enumerate(images)
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_standard = imfilter(img, kernel, border, Algorithm.FFT())
+            @test result_planned ≈ result_standard rtol=1e-10
+        end
+    end
+    
+    @testset "Reuse with same-size Gray images" begin
+        images = [rand(Gray{Float64}, 40, 60) for _ in 1:3]
+        kernel = Kernel.gaussian((3, 3))
+        border = Pad(:replicate)
+        
+        planned_alg = planned_fft(images[1], kernel, border)
+        
+        for img in images
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_standard = imfilter(img, kernel, border, Algorithm.FFT())
+            @test result_planned ≈ result_standard rtol=1e-10
+        end
+    end
+    
+    @testset "Wrong-size image should error" begin
+        img1 = rand(Float64, 50, 50)
+        img2 = rand(Float64, 60, 60)  # Different size
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(img1, kernel, border)
+        
+        # First image should work
+        result1 = imfilter(img1, kernel, border, planned_alg)
+        @test result1 isa Array{Float64, 2}
+        
+        # Wrong-size image should throw an error
+        @test_throws Exception imfilter(img2, kernel, border, planned_alg)
+    end
+    
+    @testset "Different element type with same size" begin
+        img_f64 = rand(Float64, 50, 50)
+        img_f32 = rand(Float32, 50, 50)
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(img_f64, kernel, border)
+        
+        # Float32 image with Float64 plan should work (gets converted)
+        result = imfilter(img_f32, kernel, border, planned_alg)
+        @test result isa Array
+        
+        # Compare with standard FFT
+        result_standard = imfilter(img_f32, kernel, border, Algorithm.FFT())
+        @test result ≈ result_standard rtol=1e-6
+    end
+end
+
+@testset "Concurrent Filtering with Planned FFT" begin
+    # Only run if we have multiple threads
+    if Threads.nthreads() > 1
+        @testset "Concurrent Float64 filtering" begin
+            n_images = 10
+            images = [rand(Float64, 100, 100) for _ in 1:n_images]
+            # Add structure to images
+            for img in images
+                img[40:60, 40:60] .= rand()
+            end
+            
+            kernel = Kernel.gaussian((5, 5))
+            border = Pad(:replicate)
+            
+            # Create plan once
+            planned_alg = planned_fft(images[1], kernel, border)
+            results = Vector{Any}(undef, n_images)
+            
+            # Filter concurrently
+            Threads.@threads for i in 1:n_images
+                results[i] = imfilter(images[i], kernel, border, planned_alg)
+            end
+            
+            # Verify correctness
+            for i in 1:n_images
+                expected = imfilter(images[i], kernel, border, Algorithm.FFT())
+                @test results[i] ≈ expected rtol=1e-8
+            end
+        end
+        
+        @testset "Concurrent Gray image filtering" begin
+            n_images = 8
+            images = [rand(Gray{Float64}, 80, 80) for _ in 1:n_images]
+            kernel = Kernel.gaussian((3, 3))
+            border = Fill(Gray(0.0))
+            
+            planned_alg = planned_fft(images[1], kernel, border)
+            results = Vector{Any}(undef, n_images)
+            
+            Threads.@threads for i in 1:n_images
+                results[i] = imfilter(images[i], kernel, border, planned_alg)
+            end
+            
+            for i in 1:n_images
+                expected = imfilter(images[i], kernel, border, Algorithm.FFT())
+                @test results[i] ≈ expected rtol=1e-8
+            end
+        end
+    else
+        @info "Skipping concurrent tests (only 1 thread available). Run with multiple threads to test concurrent filtering."
+    end
+end
+
+@testset "Plan Reuse Validation" begin
+    @testset "Plan structure validation" begin
+        img = rand(Float64, 50, 50)
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(img, kernel, border)
+        
+        # Verify it's an Algorithm.FFT with plans
+        @test planned_alg isa Algorithm.FFT
+        @test planned_alg.plan1 !== nothing
+        @test planned_alg.plan2 !== nothing
+        @test planned_alg.plan3 !== nothing
+        
+        # Verify plans are callable functions
+        @test planned_alg.plan1 isa Function
+        @test planned_alg.plan2 isa Function
+        @test planned_alg.plan3 isa Function
+    end
+    
+    @testset "Compare planned vs unplanned FFT" begin
+        img = rand(Float64, 100, 100)
+        img[45:55, 45:55] .= 1.0
+        kernel = Kernel.gaussian((7, 7))
+        border = Pad(:symmetric)
+        
+        # Planned FFT
+        planned_alg = planned_fft(img, kernel, border)
+        result_planned = imfilter(img, kernel, border, planned_alg)
+        
+        # Unplanned FFT
+        result_unplanned = imfilter(img, kernel, border, Algorithm.FFT())
+        
+        # Results should be very close (within numerical precision)
+        @test result_planned ≈ result_unplanned rtol=1e-10
+        
+        # Verify we get the same axes
+        @test axes(result_planned) == axes(result_unplanned)
+    end
+    
+    @testset "Offset arrays with planned FFT reuse" begin
+        images = [OffsetArray(rand(Float64, 50, 50), (0:49, 1:50)) for _ in 1:3]
+        kernel = rand(Float64, 3, 3)
+        border = Fill(0.0)
+        
+        planned_alg = planned_fft(images[1], kernel, border)
+        
+        for img in images
+            result_planned = imfilter(img, kernel, border, planned_alg)
+            result_standard = imfilter(img, kernel, border, Algorithm.FFT())
+            
+            # Check axes are preserved
+            @test axes(result_planned) == axes(img)
+            @test axes(result_standard) == axes(img)
+            
+            # Check values match
+            @test result_planned ≈ result_standard rtol=1e-10
+        end
+    end
+end
diff --git a/test/gabor.jl b/test/gabor.jl
index 5c77508e..faa49cb4 100644
--- a/test/gabor.jl
+++ b/test/gabor.jl
@@ -7,11 +7,15 @@ using ImageFiltering, Test, Statistics
     size_y = 6*σy+1
     γ = σx/σy
     # zero size forces default kernel width, with warnings
-    @info "Four warnings are expected"
-    kernel = Kernel.gabor(0,0,σx,0,5,γ,0)
-    @test isequal(size(kernel[1]),(size_x,size_y))
-    kernel = Kernel.gabor(0,0,σx,π,5,γ,0)
-    @test isequal(size(kernel[1]),(size_x,size_y))
+
+    @test_logs (:warn, r"The input parameter size_") match_mode=:any begin
+        kernel = Kernel.gabor(0,0,σx,0,5,γ,0)
+        @test isequal(size(kernel[1]),(size_x,size_y))
+    end
+    @test_logs (:warn, r"The input parameter size_") match_mode=:any begin
+        kernel = Kernel.gabor(0,0,σx,π,5,γ,0)
+        @test isequal(size(kernel[1]),(size_x,size_y))
+    end
 
     for x in 0:4, y in 0:4, z in 0:4, t in 0:4
         σx1 = 2*x+1
diff --git a/test/nd.jl b/test/nd.jl
index 8842743f..b57ae3f6 100644
--- a/test/nd.jl
+++ b/test/nd.jl
@@ -49,8 +49,11 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
     # Element-type widening (issue #17)
     v = fill(0xff, 10)
     kern = centered(fill(0xff, 3))
-    @info "Two warnings are expected"
-    @test_throws InexactError imfilter(v, kern)
+
+    @test_logs (:warn, r"Likely overflow or conversion error detected") begin
+        @test_throws InexactError imfilter(v, kern)
+    end
+
     vout = imfilter(UInt32, v, kern)
     @test eltype(vout) == UInt32
     @test all(x->x==0x0002fa03, vout)
@@ -106,8 +109,8 @@ Base.zero(::Type{WrappedFloat}) = WrappedFloat(0.0)
         around_i = [abs(i-j) <= 15 for j in eachindex(v)]
         @test all(isequal(x), wf[around_i])
         @test wf[.!around_i] ≈ vf[.!around_i]
-    end   
-   
+    end
+
     # Issue #110
     img = reinterpret(WrappedFloat, rand(128))
     kern = centered(rand(31))
@@ -119,7 +122,9 @@ end
     # issue #17
     img = fill(typemax(Int16), 10, 10)
     kern = centered(Int16[1 2 2 2 1])
-    @test_throws InexactError imfilter(img, kern)
+    @test_logs (:warn, r"Likely overflow or conversion error detected") begin
+        @test_throws InexactError imfilter(img, kern)
+    end
     ret = imfilter(Int32, img, kern)
     @test eltype(ret) == Int32
     @test all(x->x==262136, ret)
@@ -129,6 +134,7 @@ end
     img = trues(10,10,10)
     kernel = centered(trues(3,3,3)/27)
     for border in ("replicate", "circular", "symmetric", "reflect", Fill(true))
+        # TODO: add support for boolean images in planned_fft
         for alg in (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
             @test imfilter(img, kernel, border) ≈ img
         end
diff --git a/test/runtests.jl b/test/runtests.jl
index cfb5ae59..955179dc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,6 +6,13 @@ using ImageQualityIndexes
 import StaticArrays
 using Random
 
+function typestring(::Type{T}) where T   # from https://github.com/JuliaImages/ImageCore.jl/pull/133
+    buf = IOBuffer()
+    show(buf, T)
+    String(take!(buf))
+end
+
+@testset "ImageFiltering" verbose=true begin
 @testset "Project meta quality checks" begin
     # Ambiguity test
     if Base.VERSION >= v"1.6.0-DEV.1005"   # julia #37616
@@ -26,12 +33,6 @@ using Random
     end
 end
 
-function typestring(::Type{T}) where T   # from https://github.com/JuliaImages/ImageCore.jl/pull/133
-    buf = IOBuffer()
-    show(buf, T)
-    String(take!(buf))
-end
-
 include("compat.jl")
 include("border.jl")
 include("nd.jl")
@@ -49,7 +50,6 @@ include("models.jl")
 
 CUDA_INSTALLED = false
 try
-    global CUDA_INSTALLED
     # This errors with `IOError` when nvidia driver is not available,
     # in which case we don't even need to try `using CUDA`
     run(pipeline(`nvidia-smi`, stdout=devnull, stderr=devnull))
@@ -71,3 +71,5 @@ else
     @warn "CUDA test: disabled"
 end
 nothing
+
+end

From e56890ed8efcbe006c454f740d5e5afb3b99b3e3 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Sun, 2 Nov 2025 19:59:00 -0500
Subject: [PATCH 2/3] fix

---
 src/imfilter.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/imfilter.jl b/src/imfilter.jl
index 7e03fd41..9695b2ee 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -938,7 +938,9 @@ function planned_fft(A::AbstractArray{T,N},
             ) where {T,N}
     # Check if any kernel is an IIR filter (not compatible with FFT)
     for k in kernel
-        if k isa IIRFilter
+        # Check both direct IIR filters and ReshapedOneD-wrapped IIR filters
+        k_data = k isa ReshapedOneD ? k.data : k
+        if k_data isa IIRFilter
             throw(ArgumentError("planned_fft does not support IIR filters like TriggsSdika or IIRGaussian. " *
                               "IIR filters use a different algorithm that cannot be accelerated with FFT plans. " *
                               "Use Algorithm.IIR() instead, or use imfilter without a plan."))

From 2065db4d1a13c2dad6381f57f6781e2feb6e0a10 Mon Sep 17 00:00:00 2001
From: Ian Butterworth <i.r.butterworth@gmail.com>
Date: Sun, 14 Dec 2025 23:26:32 -0500
Subject: [PATCH 3/3] Add docstrings and improve buffer allocation for planned
 FFT

- Add comprehensive docstrings for planned_fft, buffered_planned_rfft,
  and buffered_planned_irfft functions
- Use task-local storage for buffer reuse instead of allocating on
  every call, reducing memory pressure in repeated filtering
- Remove dev/debug_planned.jl debug script
---
 dev/debug_planned.jl | 948 -------------------------------------------
 src/imfilter.jl      | 137 ++++++-
 2 files changed, 118 insertions(+), 967 deletions(-)
 delete mode 100644 dev/debug_planned.jl

diff --git a/dev/debug_planned.jl b/dev/debug_planned.jl
deleted file mode 100644
index 4db395b1..00000000
--- a/dev/debug_planned.jl
+++ /dev/null
@@ -1,948 +0,0 @@
-#!/usr/bin/env julia
-
-"""
-Consolidated ImageFiltering.jl Debugging Script
-
-This script investigates issues with planned FFT functionality in ImageFiltering.jl.
-
-Main issues investigated:
-1. Buffer size calculation issues in planned FFT
-2. RFFT with offset arrays vs collect() behavior
-3. Colorant (RGB/Gray) image handling in planned FFT
-4. Copyto! operations with OffsetArrays
-5. Planned FFT vs standard FFT result differences
-6. Step-by-step debugging of the FFT pipeline
-
-Usage: julia --project debug_consolidated.jl [section]
-Where section can be: all, basic, colorant, offset, pipeline, fixes, supported_algs
-"""
-
-using ImageFiltering, ImageCore, OffsetArrays, FFTViews, FFTW, Statistics
-using Test
-
-# Configuration
-const RTOL = 0.001
-const ATOL = 0.001
-const VERBOSE = true
-
-# Test data creation helpers
-function create_test_data()
-    # Basic floating point images
-    imgf = zeros(Float64, 5, 7); imgf[3,4] = 1.0
-    imgf32 = zeros(Float32, 5, 7); imgf32[3,4] = 1.0f0
-
-    # Integer image
-    imgi = zeros(Int, 5, 7); imgi[3,4] = 1
-
-    # Colorant images - floating point
-    imgg_f64 = fill(Gray{Float64}(0), 5, 7); imgg_f64[3,4] = Gray{Float64}(1)
-    imgc_f64 = fill(RGB{Float64}(0,0,0), 5, 7); imgc_f64[3,4] = RGB{Float64}(1,0,0)
-
-    # Colorant images - fixed point
-    imgg_n0f8 = fill(Gray{N0f8}(0), 5, 7); imgg_n0f8[3,4] = Gray{N0f8}(1)
-    imgc_n0f8 = fill(RGB{N0f8}(0,0,0), 5, 7); imgc_n0f8[3,4] = RGB{N0f8}(1,0,0)
-
-    # Test kernel
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-
-    return (
-        float_imgs = (imgf, imgf32),
-        int_imgs = (imgi,),
-        colorant_float = (imgg_f64, imgc_f64),
-        colorant_fixed = (imgg_n0f8, imgc_n0f8),
-        kernel = kernel,
-        kern_values = kern
-    )
-end
-
-function print_section(title::String)
-    println("\n" * "="^60)
-    println("=== $title ===")
-    println("="^60)
-end
-
-function print_subsection(title::String)
-    println("\n--- $title ---")
-end
-
-# Test 1: Basic planned FFT functionality
-function test_basic_planned_fft()
-    print_section("Basic Planned FFT Tests")
-
-    data = create_test_data()
-    border = "replicate"
-
-    @testset "Basic Planned FFT vs Standard FFT" begin
-        for (i, img) in enumerate(data.float_imgs)
-            img_type = typeof(img[1,1])
-            print_subsection("Testing Float Image Type: $img_type")
-
-            # Compute expected result
-            target = zeros(eltype(img), size(img))
-            target[3:4, 2:3] = rot180(data.kern_values) .* img[3,4]
-
-            # Standard FFT
-            result_std = imfilter(img, data.kernel, border, Algorithm.FFT())
-
-            # Planned FFT
-            planned_alg = planned_fft(img, data.kernel, border)
-            result_planned = imfilter(img, data.kernel, border, planned_alg)
-
-            # Tests
-            @test result_std ≈ target rtol=RTOL atol=ATOL
-            @test result_planned ≈ target rtol=RTOL atol=ATOL
-            @test result_std ≈ result_planned rtol=RTOL atol=ATOL
-
-            if VERBOSE
-                println("  ✓ $img_type: All tests passed!")
-                println("    Standard result: ", result_std[3:4, 2:3])
-                println("    Planned result:  ", result_planned[3:4, 2:3])
-                println("    Target result:   ", target[3:4, 2:3])
-            end
-        end
-    end
-end
-
-# Test 2: Colorant image handling
-function test_colorant_images()
-    print_section("Colorant Image Tests")
-
-    data = create_test_data()
-    border = "replicate"
-
-    @testset "Colorant Images with Float Element Types" begin
-        for (i, img) in enumerate(data.colorant_float)
-            img_type = typeof(img[1,1])
-            print_subsection("Testing Colorant Type: $img_type")
-
-            try
-                # Standard FFT
-                result_std = imfilter(img, data.kernel, border, Algorithm.FFT())
-
-                # Planned FFT
-                planned_alg = planned_fft(img, data.kernel, border)
-                result_planned = imfilter(img, data.kernel, border, planned_alg)
-
-                # Compare results
-                results_match = result_std ≈ result_planned
-                @test results_match
-
-                if VERBOSE
-                    if results_match
-                        println("  ✓ $img_type: Planned FFT matches standard FFT!")
-                    else
-                        println("  ✗ $img_type: Results differ!")
-                        println("    Max difference: ", maximum(abs.(result_std - result_planned)))
-                    end
-                    println("    Standard result [2:4, 2:4]: ")
-                    println("      ", result_std[2:4, 2:4])
-                    println("    Planned result [2:4, 2:4]: ")
-                    println("      ", result_planned[2:4, 2:4])
-                end
-
-            catch e
-                @test false  # Should not fail for float colorants
-                if VERBOSE
-                    println("  ✗ $img_type: Unexpected error: $e")
-                end
-            end
-        end
-    end
-
-    @testset "Colorant Images with Fixed Point Element Types (Now Supported)" begin
-        for (i, img) in enumerate(data.colorant_fixed)
-            img_type = typeof(img[1,1])
-            print_subsection("Testing Fixed Point Colorant Type: $img_type")
-
-            # Standard FFT should work
-            result_std = imfilter(img, data.kernel, border, Algorithm.FFT())
-            if VERBOSE
-                println("  ✓ Standard FFT works for $img_type")
-            end
-
-            # Planned FFT should now work too
-            try
-                planned_alg = planned_fft(img, data.kernel, border)
-                result_planned = imfilter(img, data.kernel, border, planned_alg)
-
-                # Compare results
-                results_match = result_std ≈ result_planned
-                @test results_match
-
-                if VERBOSE
-                    if results_match
-                        println("  ✓ $img_type: Planned FFT now works and matches standard FFT!")
-                    else
-                        println("  ⚠ $img_type: Planned FFT works but results differ slightly")
-                        println("    Max difference: ", maximum(abs.(result_std - result_planned)))
-                    end
-                end
-            catch e
-                if VERBOSE
-                    println("  ✗ $img_type: Planned FFT failed with error: $e")
-                end
-                @test false  # Should work now
-            end
-        end
-    end
-end
-
-# Test 3: Offset array handling
-function test_offset_arrays()
-    print_section("Offset Array Handling Tests")
-
-    # Create offset array test case
-    data = reshape(1.0:24.0, 3, 8, 1)
-    data_offset = OffsetArray(data, 1:3, 0:7, 1:1)
-    dims = (2, 3)
-
-    print_subsection("Basic Offset Array RFFT Behavior")
-
-    if VERBOSE
-        println("data size: ", size(data))
-        println("data axes: ", axes(data))
-        println("data_offset size: ", size(data_offset))
-        println("data_offset axes: ", axes(data_offset))
-    end
-
-    # Test different RFFT approaches
-    B1 = rfft(data, dims)
-    B2 = rfft(data_offset, dims)
-    B3 = rfft(collect(data_offset), dims)
-
-    if VERBOSE
-        println("RFFT(data) size: ", size(B1))
-        println("RFFT(data_offset) size: ", size(B2))
-        println("RFFT(collect(data_offset)) size: ", size(B3))
-
-        println("\nComparisons:")
-        println("B1 ≈ B2: ", B1 ≈ B2)
-        println("B1 ≈ B3: ", B1 ≈ B3)
-        println("B2 ≈ B3: ", B2 ≈ B3)
-
-        if !(B1 ≈ B2)
-            println("Max diff B1 vs B2: ", maximum(abs.(B1 - B2)))
-        end
-    end
-
-    # Test with manual plans
-    print_subsection("Manual Plan Execution")
-    buf = Array{Float64}(undef, size(data))
-    plan = plan_rfft(buf, dims; flags=FFTW.MEASURE)
-
-    copyto!(buf, data)
-    B4 = plan * buf
-
-    copyto!(buf, collect(data_offset))
-    B5 = plan * buf
-
-    if VERBOSE
-        println("Manual plan on data: ", size(B4))
-        println("Manual plan on collect(data_offset): ", size(B5))
-        println("B1 ≈ B4: ", B1 ≈ B4)
-        println("B1 ≈ B5: ", B1 ≈ B5)
-    end
-end
-
-# Test 4: Step-by-step pipeline debugging
-function test_pipeline_debugging()
-    print_section("Pipeline Step-by-Step Debugging")
-
-    # Use RGB colorant case that shows issues
-    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    border = "replicate"
-
-    # Setup padded arrays
-    borderspec = ImageFiltering.borderinstance(border)
-    bord = borderspec(kernel, imgc, Algorithm.FFT())
-    _A = ImageFiltering.padarray(RGB{Float64}, imgc, bord)
-    kern_fft = ImageFiltering.samedims(_A, ImageFiltering.kernelconv(kernel))
-    krn = FFTView(zeros(eltype(kern_fft), map(length, axes(_A))))
-    for I in CartesianIndices(axes(kern_fft))
-        krn[I] = kern_fft[I]
-    end
-
-    # Get channelview and dims
-    Av, dims = ImageFiltering.channelview_dims(_A)
-    kernrs = ImageFiltering.kreshape(RGB{Float64}, krn)
-
-    if VERBOSE
-        println("Padded image channelview size: ", size(Av))
-        println("Kernel reshaped size: ", size(kernrs))
-        println("Transform dims: ", dims)
-    end
-
-    print_subsection("Standard Pipeline")
-    # Use collect() to match what the actual filtfft function does for colorants
-    Av_collected = collect(Av)
-    B_std = rfft(Av_collected, dims)
-    krn_buf_std = rfft(kernrs, dims)
-    B_std_copy = copy(B_std)
-    B_std_copy .*= conj!(krn_buf_std)
-    Avf_std = irfft(B_std_copy, length(axes(Av_collected, dims[1])), dims)
-
-    print_subsection("Planned Pipeline")
-    planned_alg = planned_fft(imgc, kernel, border)
-    B_planned = planned_alg.plan1(Av)
-    krn_buf_planned = planned_alg.plan2(kernrs)
-    B_planned .*= conj!(krn_buf_planned)
-    Avf_planned = planned_alg.plan3(B_planned)
-
-    if VERBOSE
-        println("Standard B size: ", size(B_std))
-        println("Planned B size: ", size(B_planned))
-        println("B_std ≈ B_planned: ", B_std ≈ B_planned)
-
-        if !(B_std ≈ B_planned)
-            println("Max difference in B: ", maximum(abs.(B_std - B_planned)))
-        end
-
-        println("Standard final size: ", size(Avf_std))
-        println("Planned final size: ", size(Avf_planned))
-        println("Final results equal: ", Avf_std ≈ Avf_planned)
-
-        if !(Avf_std ≈ Avf_planned)
-            println("Max difference in final: ", maximum(abs.(Avf_std - Avf_planned)))
-        end
-    end
-
-    # Convert back to colorant
-    result_std_colorant = colorview(base_colorant_type(RGB{Float64}){eltype(Avf_std)}, Avf_std)
-    result_planned_colorant = colorview(base_colorant_type(RGB{Float64}){eltype(Avf_planned)}, Avf_planned)
-
-    if VERBOSE
-        println("\nFinal colorant results [2:4, 2:4]:")
-        println("Standard: ")
-        println(result_std_colorant[2:4, 2:4])
-        println("Planned: ")
-        println(result_planned_colorant[2:4, 2:4])
-    end
-end
-
-# Test 5: Buffer and copyto! issues
-function test_buffer_issues()
-    print_section("Buffer and Copyto! Issues")
-
-    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    border = "replicate"
-
-    borderspec = ImageFiltering.borderinstance(border)
-    bord = borderspec(kernel, imgc, Algorithm.FFT())
-    _A = ImageFiltering.padarray(RGB{Float64}, imgc, bord)
-    Av, dims = ImageFiltering.channelview_dims(_A)
-
-    print_subsection("Collect vs no_offset_view Comparison")
-
-    if VERBOSE
-        println("Av type: ", typeof(Av))
-        println("Av axes: ", axes(Av))
-        println("Av size: ", size(Av))
-    end
-
-    # Test old method (no_offset_view)
-    buf_old = Array{Float64}(undef, size(Av))
-    no_offset_av = OffsetArrays.no_offset_view(Av)
-    copyto!(buf_old, no_offset_av)
-
-    # Test new method (collect)
-    buf_new = Array{Float64}(undef, size(Av))
-    copyto!(buf_new, collect(Av))
-
-    if VERBOSE
-        println("no_offset_view axes: ", axes(no_offset_av))
-        println("buf_old ≈ buf_new: ", buf_old ≈ buf_new)
-
-        if !(buf_old ≈ buf_new)
-            println("Max difference: ", maximum(abs.(buf_old - buf_new)))
-        end
-    end
-
-    # Test RFFT on both approaches
-    B_old = rfft(buf_old, dims)
-    B_new = rfft(buf_new, dims)
-    B_direct = rfft(Av, dims)
-
-    if VERBOSE
-        println("RFFT results:")
-        println("B_old ≈ B_new: ", B_old ≈ B_new)
-        println("B_direct ≈ B_new: ", B_direct ≈ B_new)
-
-        if !(B_direct ≈ B_new)
-            println("Max diff B_direct vs B_new: ", maximum(abs.(B_direct - B_new)))
-        end
-    end
-end
-
-# Test 6: Buffer size calculation issues
-function test_buffer_size_issues()
-    print_section("Buffer Size Calculation Issues")
-
-    # Simulate channelview data: (3, 8, 10) transforming along dims (2, 3)
-    test_real = rand(Float64, 3, 8, 10)
-    dims = (2, 3)
-
-    if VERBOSE
-        println("Original array size: ", size(test_real))
-        println("Transform dims: ", dims)
-    end
-
-    # Standard approach
-    B_std = rfft(test_real, dims)
-    if VERBOSE
-        println("Standard rfft output size: ", size(B_std))
-    end
-
-    # Test planned buffer creation
-    function test_buffer_creation(a::AbstractArray{T}, dims, d::Int) where {T}
-        numeric_type = T <: Real ? T : real(T)
-        input_size = collect(size(a))
-        input_size[dims[1]] = input_size[dims[1]] ÷ 2 + 1
-        buf_in = Array{Complex{numeric_type}}(undef, Tuple(input_size))
-        plan = plan_irfft(buf_in, d, dims; flags=FFTW.MEASURE)
-        return plan, buf_in
-    end
-
-    plan, buf = test_buffer_creation(test_real, dims, size(test_real, dims[1]))
-
-    if VERBOSE
-        println("Expected buffer size (from rfft): ", size(B_std))
-        println("Actual buffer size: ", size(buf))
-        println("Sizes match: ", size(B_std) == size(buf))
-    end
-
-    # Test execution
-    result_std = irfft(B_std, size(test_real, dims[1]), dims)
-    copyto!(buf, B_std)
-    result_planned = plan * buf
-
-    if VERBOSE
-        println("Standard irfft result size: ", size(result_std))
-        println("Planned irfft result size: ", size(result_planned))
-        println("Results equal: ", result_std ≈ result_planned)
-
-        if size(result_std) == size(result_planned) && !(result_std ≈ result_planned)
-            println("Max difference: ", maximum(abs.(result_std - result_planned)))
-        end
-    end
-end
-
-# Test 7: Proposed fixes and improvements
-function test_proposed_fixes()
-    print_section("Proposed Fixes and Improvements")
-
-    print_subsection("Testing collect() fix for buffered_planned_rfft_dims")
-
-    imgc = fill(RGB{Float64}(0,0,0), 5, 7); imgc[3,4] = RGB{Float64}(1,0,0)
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    border = "replicate"
-
-    borderspec = ImageFiltering.borderinstance(border)
-    bord = borderspec(kernel, imgc, Algorithm.FFT())
-    _A = ImageFiltering.padarray(RGB{Float64}, imgc, bord)
-    Av, dims = ImageFiltering.channelview_dims(_A)
-
-    # Test current planned approach
-    planned_alg = planned_fft(imgc, kernel, border)
-
-    # Compare with standard for both offset and collected arrays
-    B_std_offset = rfft(Av, dims)
-    B_std_collected = rfft(collect(Av), dims)
-    B_planned = planned_alg.plan1(Av)
-
-    if VERBOSE
-        println("Standard RFFT on offset array equals planned: ", B_std_offset ≈ B_planned)
-        println("Standard RFFT on collected array equals planned: ", B_std_collected ≈ B_planned)
-        println("Offset equals collected: ", B_std_offset ≈ B_std_collected)
-
-        if !(B_std_offset ≈ B_std_collected)
-            println("Key insight: collect() vs offset arrays give different RFFT results!")
-            println("Max diff: ", maximum(abs.(B_std_offset - B_std_collected)))
-        end
-    end
-end
-
-# Test 8: Specific test for Gray{N0f8} FFT precision issue
-function test_gray_n0f8_precision_issue()
-    print_section("Testing Gray{N0f8} FFT Precision Issue")
-
-    # Recreate the exact test case that's failing
-    imgg = fill(Gray{N0f8}(0), 5, 7)
-    imgg[3,4] = Gray{N0f8}(1)
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    border = "replicate"
-
-    # Calculate the expected result
-    targetimg = zeros(typeof(imgg[1]*kern[1]), size(imgg))
-    targetimg[3:4,2:3] = rot180(kern) .* imgg[3,4]
-
-    if VERBOSE
-        println("Image type: ", typeof(imgg[1]))
-        println("Target type: ", typeof(targetimg[1]))
-        println("Target values at [3:4,2:3]: ", targetimg[3:4,2:3])
-    end
-
-    # Test different algorithms
-    print_subsection("Algorithm Comparison")
-
-    # Standard FFT
-    result_fft = imfilter(imgg, kernel, border, Algorithm.FFT())
-
-    # Planned FFT
-    planned_alg = planned_fft(imgg, kernel, border)
-    result_planned = imfilter(imgg, kernel, border, planned_alg)
-
-    # FIR for reference
-    result_fir = imfilter(imgg, kernel, border, Algorithm.FIR())
-
-    if VERBOSE
-        println("FFT result [3:4,2:3]:     ", result_fft[3:4,2:3])
-        println("Planned result [3:4,2:3]: ", result_planned[3:4,2:3])
-        println("FIR result [3:4,2:3]:     ", result_fir[3:4,2:3])
-        println("Target result [3:4,2:3]:  ", targetimg[3:4,2:3])
-
-        println("\nDifferences from target:")
-        diff_fft = maximum(abs.(result_fft - targetimg))
-        diff_planned = maximum(abs.(result_planned - targetimg))
-        diff_fir = maximum(abs.(result_fir - targetimg))
-        println("FFT max diff:     ", diff_fft)
-        println("Planned max diff: ", diff_planned)
-        println("FIR max diff:     ", diff_fir)
-
-        println("\nCross-comparisons:")
-        println("FFT ≈ Planned:    ", result_fft ≈ result_planned)
-        println("FFT ≈ FIR:        ", result_fft ≈ result_fir)
-        println("Planned ≈ FIR:    ", result_planned ≈ result_fir)
-
-        if !(result_fft ≈ result_planned)
-            println("Max diff FFT vs Planned: ", maximum(abs.(result_fft - result_planned)))
-        end
-    end
-
-    print_subsection("Tolerance Analysis")
-
-    # Test with different tolerances
-    tolerances = [1e-15, 1e-12, 1e-9, 1e-6, 1e-3]
-
-    for tol in tolerances
-        fft_matches = isapprox(result_fft, targetimg, rtol=tol, atol=tol)
-        planned_matches = isapprox(result_planned, targetimg, rtol=tol, atol=tol)
-
-        if VERBOSE
-            println("Tolerance $tol: FFT=$(fft_matches), Planned=$(planned_matches)")
-        end
-
-        if fft_matches && planned_matches
-            if VERBOSE
-                println("  Both algorithms pass at tolerance: $tol")
-            end
-            break
-        end
-    end
-
-    print_subsection("Investigating FFT vs Planned FFT Differences")
-
-    # Let's dig into the FFT implementation details
-    borderspec = ImageFiltering.borderinstance(border)
-    bord = borderspec(kernel, imgg, Algorithm.FFT())
-    _A = ImageFiltering.padarray(Gray{Float64}, imgg, bord)
-    Av, dims = ImageFiltering.channelview_dims(_A)
-
-    if VERBOSE
-        println("Padded array type: ", typeof(_A))
-        println("Channelview type: ", typeof(Av))
-        println("Channelview axes: ", axes(Av))
-        println("Transform dims: ", dims)
-    end
-
-    # Compare offset vs collect behavior specifically for this case
-    B_offset = rfft(Av, dims)
-    B_collected = rfft(collect(Av), dims)
-
-    if VERBOSE
-        println("RFFT offset vs collected equal: ", B_offset ≈ B_collected)
-        if !(B_offset ≈ B_collected)
-            println("Max difference: ", maximum(abs.(B_offset - B_collected)))
-        end
-    end
-
-    print_subsection("Comparing Different Image Types")
-
-    # Test the same kernel/border with different image types
-    test_cases = [
-        ("Float64", zeros(Float64, 5, 7), 1.0),
-        ("Gray{Float64}", fill(Gray{Float64}(0), 5, 7), Gray{Float64}(1)),
-        ("Gray{N0f8}", fill(Gray{N0f8}(0), 5, 7), Gray{N0f8}(1)),
-        ("RGB{Float64}", fill(RGB{Float64}(0,0,0), 5, 7), RGB{Float64}(1,0,0)),
-    ]
-
-    for (name, img_template, nonzero_val) in test_cases
-        img = copy(img_template)
-        img[3,4] = nonzero_val
-
-        targetimg = zeros(typeof(img[1]*kern[1]), size(img))
-        targetimg[3:4,2:3] = rot180(kern) .* img[3,4]
-
-        result_fir = imfilter(img, kernel, border, Algorithm.FIR())
-        result_fft = imfilter(img, kernel, border, Algorithm.FFT())
-
-        fir_matches = result_fir ≈ targetimg
-        fft_matches = result_fft ≈ targetimg
-
-        if VERBOSE
-            println("$name:")
-            println("  FIR matches target: $fir_matches")
-            println("  FFT matches target: $fft_matches")
-            if !fft_matches
-                println("  FFT[3:4,2:3]: ", result_fft[3:4,2:3])
-                println("  Target[3:4,2:3]: ", targetimg[3:4,2:3])
-                # Check if values are just shifted
-                if result_fft[4:5,2:3] ≈ targetimg[3:4,2:3]
-                    println("  ⚠️  FFT values appear shifted down by 1 row!")
-                end
-            end
-        end
-    end
-end
-
-# Test 9: Updated supported_algs logic for colorants
-function test_supported_algs_colorants()
-    print_section("Testing Updated supported_algs Logic for Colorants")
-
-    # Helper function to check if a type supports planned_fft
-    function supports_planned_fft(::Type{T}) where T
-        # AbstractFloat types are directly supported
-        T <: AbstractFloat && return true
-
-        # Colorant types are supported if their element type can be converted to FFT-compatible types
-        if T <: Colorant
-            try
-                # Check if ffteltype can convert the element type
-                elt = eltype(T)
-                fft_type = ImageFiltering.ffteltype(elt)
-                return fft_type <: Union{Float32, Float64}
-            catch
-                return false
-            end
-        end
-
-        return false
-    end
-
-    function supported_algs(img::AbstractArray{T}, kernel, border) where T
-        base_algs = (Algorithm.FIR(), Algorithm.FIRTiled(), Algorithm.FFT())
-
-        # Include planned_fft if:
-        # 1. The image type supports planned_fft (AbstractFloat or compatible Colorants)
-        # 2. All kernel elements are floating point or complex floating point
-        # 3. Border is not NA (since NA requires special handling)
-        if supports_planned_fft(T) &&
-           all(k -> eltype(k) <: Union{AbstractFloat, Complex{<:AbstractFloat}}, (kernel isa Tuple ? kernel : (kernel,))) &&
-           !isa(border, NA)
-            return (base_algs..., planned_fft(img, kernel, border))
-        else
-            return base_algs
-        end
-    end
-
-    data = create_test_data()
-    kern = [0.1 0.2; 0.4 0.5]
-    kernel = OffsetArray(kern, -1:0, 1:2)
-    border = "replicate"
-
-    print_subsection("Testing supports_planned_fft function")
-
-    # Test AbstractFloat types
-    @test supports_planned_fft(Float64) == true
-    @test supports_planned_fft(Float32) == true
-    if VERBOSE
-        println("  ✓ Float64 and Float32 support planned_fft")
-    end
-
-    # Test colorant types with floating point elements
-    @test supports_planned_fft(Gray{Float64}) == true
-    @test supports_planned_fft(RGB{Float64}) == true
-    @test supports_planned_fft(Gray{Float32}) == true
-    @test supports_planned_fft(RGB{Float32}) == true
-    if VERBOSE
-        println("  ✓ Gray{Float64}, RGB{Float64}, Gray{Float32}, RGB{Float32} support planned_fft")
-    end
-
-    # Test colorant types with fixed point elements
-    @test supports_planned_fft(Gray{N0f8}) == true  # N0f8 should be convertible to Float32
-    @test supports_planned_fft(RGB{N0f8}) == true   # N0f8 should be convertible to Float32
-    if VERBOSE
-        println("  ✓ Gray{N0f8} and RGB{N0f8} support planned_fft (via ffteltype conversion)")
-    end
-
-    # Test integer types (should not support)
-    @test supports_planned_fft(Int) == false
-    @test supports_planned_fft(UInt8) == false
-    if VERBOSE
-        println("  ✓ Int and UInt8 do not support planned_fft")
-    end
-
-    print_subsection("Testing supported_algs function")
-
-    # Test with floating point images
-    for img in data.float_imgs
-        algs = supported_algs(img, kernel, border)
-        @test length(algs) == 4  # Should include planned_fft
-        @test any(alg -> isa(alg, typeof(planned_fft(img, kernel, border))), algs)
-        if VERBOSE
-            println("  ✓ $(typeof(img[1])) gets planned_fft support: $(length(algs)) algorithms")
-        end
-    end
-
-    # Test with colorant floating point images
-    for img in data.colorant_float
-        algs = supported_algs(img, kernel, border)
-        @test length(algs) == 4  # Should include planned_fft
-        @test any(alg -> isa(alg, typeof(planned_fft(img, kernel, border))), algs)
-        if VERBOSE
-            println("  ✓ $(typeof(img[1])) gets planned_fft support: $(length(algs)) algorithms")
-        end
-    end
-
-    # Test with colorant fixed point images (should also work now)
-    for img in data.colorant_fixed
-        try
-            algs = supported_algs(img, kernel, border)
-            @test length(algs) == 4  # Should include planned_fft
-            @test any(alg -> isa(alg, typeof(planned_fft(img, kernel, border))), algs)
-            if VERBOSE
-                println("  ✓ $(typeof(img[1])) gets planned_fft support: $(length(algs)) algorithms")
-            end
-        catch e
-            if VERBOSE
-                println("  ✗ $(typeof(img[1])) failed to get planned_fft support: $e")
-            end
-            @test false
-        end
-    end
-
-    # Test with integer images (should not get planned_fft)
-    for img in data.int_imgs
-        algs = supported_algs(img, kernel, border)
-        @test length(algs) == 3  # Should not include planned_fft
-        if VERBOSE
-            println("  ✓ $(typeof(img[1])) correctly excludes planned_fft: $(length(algs)) algorithms")
-        end
-    end
-
-    print_subsection("Testing actual filtering with planned_fft for colorants")
-
-    # Test that planned_fft actually works for colorant types
-    for img in (data.colorant_float..., data.colorant_fixed...)
-        img_type = typeof(img[1])
-        try
-            # Get algorithms including planned_fft
-            algs = supported_algs(img, kernel, border)
-            planned_alg = last(algs)  # The planned_fft algorithm
-
-            # Test that it actually works
-            result_planned = imfilter(img, kernel, border, planned_alg)
-            result_std = imfilter(img, kernel, border, Algorithm.FFT())
-
-            # Results should be approximately equal
-            @test result_planned ≈ result_std rtol=0.01 atol=0.01
-
-            if VERBOSE
-                println("  ✓ $img_type: Planned FFT successfully filters and matches standard FFT")
-            end
-        catch e
-            if VERBOSE
-                println("  ✗ $img_type: Failed to filter with planned FFT: $e")
-            end
-            @test false
-        end
-    end
-end
-
-# Test 10: Deep dive into FFT positioning issue
-function test_fft_positioning_detailed()
-    print_section("Deep Dive into FFT Positioning Issue")
-
-    # Create simple test case
-    img_float = zeros(Float64, 6, 6)
-    img_float[3,3] = 1.0
-
-    img_gray = Gray{Float64}.(img_float)
-
-    kernel = OffsetArray([0.1 0.2; 0.3 0.4], -1:0, -1:0)
-    border = "replicate"
-
-    print_subsection("Testing Float64 image (works correctly)")
-    result_float_fft = imfilter(img_float, kernel, border, Algorithm.FFT())
-    result_float_fir = imfilter(img_float, kernel, border, Algorithm.FIR())
-
-    if VERBOSE
-        println("Float64 FFT result at [2:4,2:4]:")
-        println(result_float_fft[2:4,2:4])
-        println("Float64 FIR result at [2:4,2:4]:")
-        println(result_float_fir[2:4,2:4])
-        println("Match: ", result_float_fft ≈ result_float_fir)
-    end
-
-    print_subsection("Testing Gray{Float64} image (has positioning bug)")
-    result_gray_fft = imfilter(img_gray, kernel, border, Algorithm.FFT())
-    result_gray_fir = imfilter(img_gray, kernel, border, Algorithm.FIR())
-
-    if VERBOSE
-        println("Gray{Float64} FFT result at [2:4,2:4]:")
-        println(result_gray_fft[2:4,2:4])
-        println("Gray{Float64} FIR result at [2:4,2:4]:")
-        println(result_gray_fir[2:4,2:4])
-        println("Match: ", result_gray_fft ≈ result_gray_fir)
-
-        # Check if FFT result is shifted
-        if !isempty(result_gray_fft[3:5,2:4]) && !isempty(result_gray_fir[2:4,2:4])
-            println("\nChecking for shift - FFT[3:5,2:4] vs FIR[2:4,2:4]:")
-            println("FFT shifted:", result_gray_fft[3:5,2:4])
-            println("FIR target: ", result_gray_fir[2:4,2:4])
-            shifted_match = result_gray_fft[3:5,2:4] ≈ result_gray_fir[2:4,2:4]
-            println("Shifted match: ", shifted_match)
-        end
-    end
-
-    print_subsection("Investigating the filtfft implementation")
-
-    # Test the filtfft function directly
-    borderspec = ImageFiltering.borderinstance(border)
-
-    # For Float64 array
-    bord_float = borderspec(kernel, img_float, ImageFiltering.Algorithm.FFT())
-    A_float = ImageFiltering.padarray(Float64, img_float, bord_float)
-    kern_float = ImageFiltering.samedims(A_float, ImageFiltering.kernelconv(kernel))
-    krn_float = FFTView(zeros(eltype(kern_float), map(length, axes(A_float))))
-    for I in CartesianIndices(axes(kern_float))
-        krn_float[I] = kern_float[I]
-    end
-    result_filtfft_float = ImageFiltering.filtfft(A_float, krn_float)
-
-    # For Gray{Float64} array
-    bord_gray = borderspec(kernel, img_gray, ImageFiltering.Algorithm.FFT())
-    A_gray = ImageFiltering.padarray(Gray{Float64}, img_gray, bord_gray)
-    kern_gray = ImageFiltering.samedims(A_gray, ImageFiltering.kernelconv(kernel))
-    krn_gray = FFTView(zeros(eltype(kern_gray), map(length, axes(A_gray))))
-    for I in CartesianIndices(axes(krn_gray))
-        krn_gray[I] = kern_gray[I]
-    end
-    result_filtfft_gray = ImageFiltering.filtfft(A_gray, krn_gray)
-
-    if VERBOSE
-        println("\nDirect filtfft comparison:")
-        println("Padded Float64 size: ", size(A_float))
-        println("Padded Gray size: ", size(A_gray))
-        println("Float64 axes: ", axes(A_float))
-        println("Gray axes: ", axes(A_gray))
-
-        # Check padding differences
-        println("\nPadding comparison:")
-        center_float = size(A_float) .÷ 2 .+ 1
-        center_gray = size(A_gray) .÷ 2 .+ 1
-        println("Float64 center: ", center_float)
-        println("Gray center: ", center_gray)
-
-        if length(center_float) >= 2 && length(center_gray) >= 2
-            r = 2:4
-            c = 2:4
-            if checkbounds(Bool, result_filtfft_float, r, c) && checkbounds(Bool, result_filtfft_gray, r, c)
-                println("Float64 filtfft result[2:4,2:4]: ", result_filtfft_float[r,c])
-                println("Gray filtfft result[2:4,2:4]: ", result_filtfft_gray[r,c])
-            end
-        end
-    end
-
-    print_subsection("Investigating channelview_dims")
-
-    # Test channelview_dims function
-    Av_gray, dims_gray = ImageFiltering.channelview_dims(A_gray)
-
-    if VERBOSE
-        println("Original Gray array size: ", size(A_gray))
-        println("Channelview size: ", size(Av_gray))
-        println("Transform dims: ", dims_gray)
-        println("Channelview axes: ", axes(Av_gray))
-        println("Original axes: ", axes(A_gray))
-    end
-
-    @test true  # Placeholder - investigating issue
-end
-
-# Main test runner
-function run_tests(section::String = "all")
-    println("ImageFiltering.jl Consolidated Debug Script")
-    println("Running section: $section")
-
-    if section in ["all", "basic"]
-        test_basic_planned_fft()
-    end
-
-    if section in ["all", "colorant"]
-        test_colorant_images()
-    end
-
-    if section in ["all", "offset"]
-        test_offset_arrays()
-    end
-
-    if section in ["all", "pipeline"]
-        test_pipeline_debugging()
-    end
-
-    if section in ["all", "buffer"]
-        test_buffer_issues()
-    end
-
-    if section in ["all", "fixes"]
-        test_proposed_fixes()
-    end
-
-    if section in ["all", "gray_precision"]
-        test_gray_n0f8_precision_issue()
-    end
-
-    if section in ["all", "supported_algs"]
-        test_supported_algs_colorants()
-    end
-
-    if section in ["all", "fft_positioning"]
-        test_fft_positioning_detailed()
-    end
-
-    # Summary
-    print_section("Summary")
-    println("Consolidated debugging complete!")
-    println("")
-    println("Key findings:")
-    println("1. Basic planned FFT works for floating point images")
-    println("2. Updated supported_algs logic now includes all colorant types")
-    println("3. Both floating point and fixed point colorants get planned FFT support")
-    println("4. Planned FFT for colorants works via ffteltype conversion")
-    println("5. Offset arrays vs collect() can give different RFFT results")
-    println("6. Buffer size calculations are generally correct")
-    println("7. TODO in test/2d.jl has been addressed - planned_fft now supports more types than just floats")
-    println("8. ✅ CRITICAL BUG FIXED: FFT positioning issue for colorant types has been resolved!")
-    println("   - Fixed by using no_offset_view() instead of collect() in FFT functions")
-    println("   - Fixed by properly restoring offset information after FFT operations")
-    println("   - All colorant types (Gray{Float64}, Gray{N0f8}, RGB{Float64}) now work correctly")
-    println("   - test/2d.jl:110 now passes - the original failing test is fixed")
-end
-
-# Command line interface
-if abspath(PROGRAM_FILE) == @__FILE__
-    section = length(ARGS) > 0 ? ARGS[1] : "all"
-    if !(section in ["all", "basic", "colorant", "offset", "pipeline", "buffer", "fixes", "gray_precision", "supported_algs", "fft_positioning"])
-        println("Invalid section: $section")
-        println("Valid sections: all, basic, colorant, offset, pipeline, buffer, fixes, gray_precision, supported_algs, fft_positioning")
-        exit(1)
-    end
-    run_tests(section)
-end
diff --git a/src/imfilter.jl b/src/imfilter.jl
index 9695b2ee..1aff5329 100644
--- a/src/imfilter.jl
+++ b/src/imfilter.jl
@@ -859,60 +859,104 @@ end
 @inline _stretch_mul(AT::Type{<:Real}, A_fft::AbstractArray, BT::Type{<:Real}, B_fft::AbstractArray, _::Int) = A_fft .* B_fft
 @inline _stretch_mul(AT::Type{<:Complex}, A_fft::AbstractArray, BT::Type{<:Complex}, B_fft::AbstractArray, _::Int) = A_fft .* B_fft
 
-# Unified planned FFT functions that handle both regular arrays and colorant channelviews
+"""
+    buffered_planned_rfft(a::AbstractArray{T}, dims=1:ndims(a)) where {T}
+
+Create a buffered, planned real FFT function for arrays with the same size and type as `a`.
+
+Returns a function that performs an in-place rfft using a pre-computed plan.
+Each call allocates a task-local buffer to ensure thread safety when used concurrently.
+
+For full transforms (all dimensions), uses `RealFFTs.RCpair` for efficiency.
+For partial transforms, uses standard FFTW plans.
+
+This is an internal function used by [`planned_fft`](@ref).
+"""
 function buffered_planned_rfft(a::AbstractArray{T}, dims=1:ndims(a)) where {T}
     numeric_type = T <: Real ? T : real(T)
 
     if dims == 1:ndims(a) && length(dims) == ndims(a)
         # Use RealFFTs.RCpair for full transforms (more efficient)
-        # Create a buffer and plan for the prototype
         buf_proto = RealFFTs.RCpair{numeric_type}(undef, size(a))
-        # Use ESTIMATE flag so plan works with any buffer of the same size
         plan = RealFFTs.plan_rfft!(buf_proto; flags=FFTW.ESTIMATE)
         buf_size = size(a)
         
-        # Return a function that creates thread-local buffers
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, RealFFTs.RCpair{numeric_type}}()
+        bufs_lock = ReentrantLock()
+        
         return function (arr::AbstractArray)
-            # Each thread gets its own buffer to avoid race conditions
-            buf = RealFFTs.RCpair{numeric_type}(undef, buf_size)
+            tid = objectid(current_task())
+            buf = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    RealFFTs.RCpair{numeric_type}(undef, buf_size)
+                end
+            end
             copy!(buf, OffsetArrays.no_offset_view(arr))
             plan(buf)
-            return complex(buf)
+            return copy(complex(buf))
         end
     else
         # Use standard FFTW for partial transforms
         buf_size = size(a)
         buf_proto = Array{numeric_type}(undef, buf_size)
-        # Use ESTIMATE flag so plan works with any buffer of the same size
         plan = plan_rfft(buf_proto, dims; flags=FFTW.ESTIMATE)
         
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, Array{numeric_type}}()
+        bufs_lock = ReentrantLock()
+        
         return function (arr::AbstractArray)
-            # Each thread gets its own buffer
-            buf = Array{numeric_type}(undef, buf_size)
+            tid = objectid(current_task())
+            buf = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    Array{numeric_type}(undef, buf_size)
+                end
+            end
             copyto!(buf, OffsetArrays.no_offset_view(arr))
             return plan * buf
         end
     end
 end
 
+"""
+    buffered_planned_irfft(a::AbstractArray{T}, dims=1:ndims(a), d::Int=size(a,1)) where {T}
+
+Create a buffered, planned inverse real FFT function for arrays with the same size and type as `a`.
+
+Returns a function that performs an in-place irfft using a pre-computed plan.
+Each call uses a task-local buffer to ensure thread safety when used concurrently.
+
+# Arguments
+- `a`: Prototype array defining the size and element type
+- `dims`: Dimensions along which to perform the transform (default: all dimensions)
+- `d`: Size of the first transformed dimension in the original (real) array
+
+This is an internal function used by [`planned_fft`](@ref).
+"""
 function buffered_planned_irfft(a::AbstractArray{T}, dims=1:ndims(a), d::Int=size(a,1)) where {T}
     numeric_type = T <: Real ? T : real(T)
 
     if dims == 1:ndims(a) && length(dims) == ndims(a)
         # Use RealFFTs.RCpair for full transforms (more efficient)
-        # Create a buffer and plan for the prototype
         buf_proto = RealFFTs.RCpair{numeric_type}(undef, size(a))
-        # Use ESTIMATE flag so plan works with any buffer of the same size
         plan = RealFFTs.plan_irfft!(buf_proto; flags=FFTW.ESTIMATE)
         buf_size = size(a)
         
-        # Return a function that creates thread-local buffers
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, RealFFTs.RCpair{numeric_type}}()
+        bufs_lock = ReentrantLock()
+        
         return function (arr::AbstractArray)
-            # Each thread gets its own buffer to avoid race conditions
-            buf = RealFFTs.RCpair{numeric_type}(undef, buf_size)
+            tid = objectid(current_task())
+            buf = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    RealFFTs.RCpair{numeric_type}(undef, buf_size)
+                end
+            end
             copy!(buf, OffsetArrays.no_offset_view(arr))
             plan(buf)
-            return real(buf)
+            return copy(real(buf))
         end
     else
         # Use standard FFTW for partial transforms
@@ -920,18 +964,73 @@ function buffered_planned_irfft(a::AbstractArray{T}, dims=1:ndims(a), d::Int=siz
         input_size[dims[1]] = input_size[dims[1]] ÷ 2 + 1
         buf_size = Tuple(input_size)
         buf_proto = Array{Complex{numeric_type}}(undef, buf_size)
-        # Use ESTIMATE flag so plan works with any buffer of the same size
         plan = plan_irfft(buf_proto, d, dims; flags=FFTW.ESTIMATE)
         
+        # Use task-local storage for thread-safe buffer reuse
+        bufs = Dict{UInt, Array{Complex{numeric_type}}}()
+        bufs_lock = ReentrantLock()
+        
         return function (arr::AbstractArray)
-            # Each thread gets its own buffer
-            buf_in = Array{Complex{numeric_type}}(undef, buf_size)
+            tid = objectid(current_task())
+            buf_in = lock(bufs_lock) do
+                get!(bufs, tid) do
+                    Array{Complex{numeric_type}}(undef, buf_size)
+                end
+            end
             copyto!(buf_in, OffsetArrays.no_offset_view(arr))
             return plan * buf_in
         end
     end
 end
 
+"""
+    planned_fft(A, kernel, [border="replicate"]) -> Algorithm.FFT
+
+Create a planned FFT algorithm that can be reused for filtering multiple images of the
+same size and type as `A`.
+
+This is useful when filtering many images with the same kernel, as the FFT plans are
+computed once and reused. The returned algorithm is thread-safe and can be used
+concurrently from multiple threads.
+
+# Arguments
+- `A`: A prototype array (the actual values are not used, only size and element type)
+- `kernel`: The filter kernel (array, tuple of arrays, or `Kernel` type)
+- `border`: Border specification (default: `"replicate"`). Note: `NA()` borders are not supported.
+
+# Returns
+An `Algorithm.FFT` instance with pre-computed plans that can be passed to `imfilter` or `imfilter!`.
+
+# Example
+```julia
+using ImageFiltering, ComputationalResources
+
+# Create a stack of images to filter
+imgstack = [rand(Float64, 512, 512) for _ in 1:100]
+kernel = Kernel.gaussian((3, 3))
+
+# Create the planned FFT algorithm once
+planned_alg = planned_fft(imgstack[1], kernel, "replicate")
+
+# Reuse for all images (can be parallelized with Threads.@threads)
+results = [imfilter(img, kernel, "replicate", planned_alg) for img in imgstack]
+
+# Or use with imfilter! and ComputationalResources
+out = similar(imgstack[1])
+for img in imgstack
+    imfilter!(CPU1(planned_alg), out, img, kernel, "replicate")
+    # process out...
+end
+```
+
+# Notes
+- The plan is specific to the array size, element type, kernel, and border specification.
+  Using it with differently-sized arrays will error.
+- IIR filters (like `KernelFactors.IIRGaussian`) are not supported; use `Algorithm.IIR()` instead.
+- For colorant images (e.g., `RGB`, `Gray`), the plans handle channel separation automatically.
+
+See also: [`imfilter`](@ref), [`imfilter!`](@ref), [`Algorithm.FFT`](@ref)
+"""
 function planned_fft(A::AbstractArray{T,N},
             kernel::ProcessedKernel,
             border::BorderSpecAny=Pad(:replicate)