From 2882e547623e6de66d621ee0f6bae6fea13769fd Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Wed, 14 Jan 2026 23:57:49 +0100
Subject: [PATCH 1/8] Add CUDA neighbor list utilities

---
 README.md                  |  43 ++++++
 src/SPHCUDANeighborList.jl | 287 +++++++++++++++++++++++++++++++++++++
 src/SPHExample.jl          |  11 ++
 3 files changed, 341 insertions(+)
 create mode 100644 src/SPHCUDANeighborList.jl

diff --git a/README.md b/README.md
index c23bc265..c3f854e6 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@ The project demonstrates how to assemble a small SPH solver with Julia. It focus
 - **Dynamic boundary condition** – inspired by DualSPHysics.
 - **Density diffusion** – based on Fourtakas et al. 2019 to reduce pressure noise.
 - **Wendland quintic kernel** – simple and stable without tensile corrections.
+- **CUDA neighbor lists** – optional GPU neighbor cell list and loop utilities for CUDA arrays.
 
 ## Folder Structure
 
@@ -91,6 +92,48 @@ one plus the particles in its neighbor stencil.
 Neighbor rebuilds use a per-cell ordering buffer without reordering particle
 storage.
 
+### CUDA Neighbor Utilities
+
+This package ships with an optional CUDA-focused neighbor list to offload the
+cell list construction and neighbor iteration. Install CUDA.jl in the project
+environment before using these helpers:
+
+```julia
+using Pkg
+Pkg.add("CUDA")
+```
+
+The CUDA utilities are only loaded when CUDA.jl is available. The workflow is:
+
+1. Build a `CUDACellGrid` describing the simulation bounds and desired cell size.
+2. Allocate a `CUDANeighborList` once for the particle count.
+3. Call `UpdateNeighborsCUDA!` each step after updating positions.
+4. Launch `NeighborLoopCUDA!` with a GPU kernel that computes pairwise
+   interactions.
+
+Here is a minimal sketch:
+
+```julia
+using CUDA
+using SPHExample
+
+grid = CUDACellGrid(SVector(0.0f0, 0.0f0), 0.01f0, SVector(128, 128))
+neighbor_list = AllocateCUDANeighborList(grid, length(position))
+
+UpdateNeighborsCUDA!(neighbor_list, position)
+
+function InteractionKernel(i, j, acc, pos)
+    @inbounds acc[i] += pos[j] - pos[i]
+    return nothing
+end
+
+NeighborLoopCUDA!(InteractionKernel, neighbor_list, acc, position)
+```
+
+These CUDA utilities are intended as building blocks for integrating GPU
+workflows into the existing solver. You can keep the CPU path intact and
+incrementally replace the neighbor loop where it makes sense for your use case.
+
 ## Help
 
 Questions or issues can be posted on the GitHub issue tracker. Response times may vary but all feedback is welcome.
diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
new file mode 100644
index 00000000..a684ab55
--- /dev/null
+++ b/src/SPHCUDANeighborList.jl
@@ -0,0 +1,287 @@
+module SPHCUDANeighborList
+
+export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
+       ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
+       UpdateNeighborsCUDA!, NeighborLoopCUDA!
+
+using CUDA
+using StaticArrays
+
+"""
+    CUDACellGrid(Origin, CellSize, Dims)
+
+Describe a uniform cell grid for CUDA neighbor list construction.
+"""
+struct CUDACellGrid{D, T}
+    Origin::SVector{D, T}
+    InvCellSize::T
+    Dims::SVector{D, Int}
+    Strides::SVector{D, Int}
+    CellCount::Int
+end
+
+"""
+    CUDACellGrid(Origin, CellSize, Dims)
+
+Build a CUDA cell grid from an origin, cell size, and grid dimensions.
+"""
+function CUDACellGrid(Origin::SVector{D, T}, CellSize::T, Dims::SVector{D, Int}) where {D, T}
+    Strides = SVector{D, Int}(ntuple(i -> i == 1 ? 1 : prod(Dims[1:(i - 1)]), D))
+    CellCount = prod(Dims)
+    return CUDACellGrid{D, T}(Origin, inv(CellSize), Dims, Strides, CellCount)
+end
+
+"""
+    CUDANeighborList(Grid, ...)
+
+CUDA storage for per-cell particle ranges and neighbor cell indices.
+"""
+struct CUDANeighborList{D, T}
+    Grid::CUDACellGrid{D, T}
+    CellIds::CuArray{Int}
+    ParticleOrder::CuArray{Int}
+    ParticleRanges::CuArray{Int}
+    CellCounts::CuArray{Int}
+    CellOffsets::CuArray{Int}
+    NeighborCells::CuArray{Int}
+    NeighborCounts::CuArray{Int}
+    NeighborOffsets::CuArray{SVector{D, Int}}
+end
+
+"""
+    ConstructNeighborOffsets(Val(D))
+
+Return the non-zero stencil offsets for a D-dimensional Moore neighborhood.
+"""
+function ConstructNeighborOffsets(::Val{D}) where {D}
+    Offsets = SVector{D, Int}[]
+    for Offset in CartesianIndices(ntuple(_ -> -1:1, D))
+        if !all(iszero, Tuple(Offset))
+            push!(Offsets, SVector{D, Int}(Tuple(Offset)))
+        end
+    end
+    return Offsets
+end
+
+"""
+    AllocateCUDANeighborList(Grid, ParticleCount)
+
+Allocate GPU buffers for neighbor cell lists and particle ordering.
+"""
+function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) where {D, T}
+    CellIds = CUDA.zeros(Int, ParticleCount)
+    ParticleOrder = CUDA.zeros(Int, ParticleCount)
+    ParticleRanges = CUDA.zeros(Int, Grid.CellCount + 1)
+    CellCounts = CUDA.zeros(Int, Grid.CellCount)
+    CellOffsets = CUDA.zeros(Int, Grid.CellCount)
+    NeighborOffsets = CuArray(ConstructNeighborOffsets(Val(D)))
+    NeighborCells = CUDA.zeros(Int, length(NeighborOffsets), Grid.CellCount)
+    NeighborCounts = CUDA.zeros(Int, Grid.CellCount)
+    NeighborList = CUDANeighborList{D, T}(
+        Grid,
+        CellIds,
+        ParticleOrder,
+        ParticleRanges,
+        CellCounts,
+        CellOffsets,
+        NeighborCells,
+        NeighborCounts,
+        NeighborOffsets,
+    )
+    BuildNeighborCellListsCUDA!(NeighborList)
+    return NeighborList
+end
+
+@inline function LinearCellId(CellCoords::SVector{D, Int}, Grid::CUDACellGrid{D, T}) where {D, T}
+    CellId = 1
+    @inbounds for DimIndex in 1:D
+        CellId += CellCoords[DimIndex] * Grid.Strides[DimIndex]
+    end
+    return CellId
+end
+
+@inline function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T}
+    Remaining = CellId - 1
+    Coords = ntuple(DimIndex -> begin
+        Dim = Grid.Dims[DimIndex]
+        Coord = Remaining % Dim
+        Remaining ÷= Dim
+        Coord
+    end, D)
+    return SVector{D, Int}(Coords)
+end
+
+function ComputeCellIdsKernel!(CellIds, Position, Grid::CUDACellGrid{D, T}) where {D, T}
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        Pos = Position[Index]
+        Coords = ntuple(DimIndex -> begin
+            Raw = (Pos[DimIndex] - Grid.Origin[DimIndex]) * Grid.InvCellSize
+            Cell = floor(Int, Raw)
+            clamp(Cell, 0, Grid.Dims[DimIndex] - 1)
+        end, D)
+        CellIds[Index] = LinearCellId(SVector{D, Int}(Coords), Grid)
+    end
+    return nothing
+end
+
+function ComputeCellCountsKernel!(CellCounts, CellIds)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = CellIds[Index]
+        CUDA.atomic_add!(CellCounts, CellId, 1)
+    end
+    return nothing
+end
+
+function BuildParticleRangesKernel!(ParticleRanges, CellPrefix)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellPrefix)
+        if Index == 1
+            ParticleRanges[1] = 1
+        end
+        ParticleRanges[Index + 1] = CellPrefix[Index] + 1
+    end
+    return nothing
+end
+
+function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = CellIds[Index]
+        TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, 1)
+        ParticleOrder[TargetIndex] = Index
+    end
+    return nothing
+end
+
+function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid)
+    CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if CellId <= Grid.CellCount
+        CellCoord = CellCoordsFromId(CellId, Grid)
+        NeighborIndex = 0
+        for OffsetIndex in 1:length(NeighborOffsets)
+            Offset = NeighborOffsets[OffsetIndex]
+            NeighborCoord = CellCoord + Offset
+            IsValid = true
+            @inbounds for DimIndex in 1:length(Grid.Dims)
+                if NeighborCoord[DimIndex] < 0 || NeighborCoord[DimIndex] >= Grid.Dims[DimIndex]
+                    IsValid = false
+                    break
+                end
+            end
+            if IsValid
+                NeighborIndex += 1
+                NeighborCells[NeighborIndex, CellId] = LinearCellId(NeighborCoord, Grid)
+            end
+        end
+        NeighborCounts[CellId] = NeighborIndex
+    end
+    return nothing
+end
+
+"""
+    BuildNeighborCellListsCUDA!(NeighborList)
+
+Precompute the neighbor cell indices for each grid cell on the GPU.
+"""
+function BuildNeighborCellListsCUDA!(NeighborList::CUDANeighborList)
+    Threads = 256
+    Blocks = cld(NeighborList.Grid.CellCount, Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks BuildNeighborCellListsKernel!(
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        NeighborList.NeighborOffsets,
+        NeighborList.Grid,
+    )
+    return nothing
+end
+
+"""
+    UpdateNeighborsCUDA!(NeighborList, Position)
+
+Update cell ids, particle ranges, and particle order on the GPU.
+"""
+function UpdateNeighborsCUDA!(NeighborList::CUDANeighborList, Position)
+    Threads = 256
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks ComputeCellIdsKernel!(
+        NeighborList.CellIds,
+        Position,
+        NeighborList.Grid,
+    )
+    fill!(NeighborList.CellCounts, 0)
+    CUDA.@cuda threads=Threads blocks=Blocks ComputeCellCountsKernel!(
+        NeighborList.CellCounts,
+        NeighborList.CellIds,
+    )
+    CellPrefix = CUDA.cumsum(NeighborList.CellCounts)
+    BlocksCells = cld(length(CellPrefix), Threads)
+    CUDA.@cuda threads=Threads blocks=BlocksCells BuildParticleRangesKernel!(
+        NeighborList.ParticleRanges,
+        CellPrefix,
+    )
+    copyto!(NeighborList.CellOffsets, NeighborList.ParticleRanges[1:(end - 1)])
+    CUDA.@cuda threads=Threads blocks=Blocks BuildParticleOrderKernel!(
+        NeighborList.ParticleOrder,
+        NeighborList.CellOffsets,
+        NeighborList.CellIds,
+    )
+    return nothing
+end
+
+function NeighborLoopKernel!(InteractionKernel,
+                             CellIds,
+                             ParticleOrder,
+                             ParticleRanges,
+                             NeighborCells,
+                             NeighborCounts,
+                             Args...)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = CellIds[Index]
+        StartIndex = ParticleRanges[CellId]
+        EndIndex = ParticleRanges[CellId + 1] - 1
+        for j in StartIndex:EndIndex
+            NeighborIndex = ParticleOrder[j]
+            if NeighborIndex != Index
+                InteractionKernel(Index, NeighborIndex, Args...)
+            end
+        end
+        NeighborCount = NeighborCounts[CellId]
+        for n in 1:NeighborCount
+            NeighborCell = NeighborCells[n, CellId]
+            StartIndex = ParticleRanges[NeighborCell]
+            EndIndex = ParticleRanges[NeighborCell + 1] - 1
+            for j in StartIndex:EndIndex
+                NeighborIndex = ParticleOrder[j]
+                InteractionKernel(Index, NeighborIndex, Args...)
+            end
+        end
+    end
+    return nothing
+end
+
+"""
+    NeighborLoopCUDA!(InteractionKernel, NeighborList, Args...; Threads=256)
+
+Launch a CUDA neighbor loop that calls `InteractionKernel(i, j, Args...)`.
+"""
+function NeighborLoopCUDA!(InteractionKernel,
+                           NeighborList::CUDANeighborList,
+                           Args...;
+                           Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!(
+        InteractionKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
+end
diff --git a/src/SPHExample.jl b/src/SPHExample.jl
index a56895ea..0aa158a9 100644
--- a/src/SPHExample.jl
+++ b/src/SPHExample.jl
@@ -14,6 +14,10 @@ module SPHExample
     include("OpenExternalPrograms.jl")
     include("SPHDensityDiffusionModels.jl")  
     include("SPHNeighborList.jl")
+    const HasCUDA = Base.find_package("CUDA") !== nothing
+    if HasCUDA
+        include("SPHCUDANeighborList.jl")
+    end
     include("SPHCellList.jl") #Must be last    
 
     # Re-export desired functions from each submodule
@@ -60,6 +64,13 @@ module SPHExample
     export ConstructStencil, ExtractCells!, UpdateNeighbors!,
            BuildNeighborCellLists!, ComputeCellParticleCounts, ComputeCellNeighborCounts
 
+    if HasCUDA
+        using .SPHCUDANeighborList
+        export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
+               ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
+               UpdateNeighborsCUDA!, NeighborLoopCUDA!
+    end
+
     using .SPHCellList
     export NeighborLoop!, ComputeInteractions!, RunSimulation
 

From a8ba51d481d7dc35d3b13911f76f2bf0f27c5f09 Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:03:24 +0100
Subject: [PATCH 2/8] Add CUDA StillWedge neighbor example

---
 README.md                      |  2 +
 example/StillWedgeMDBC_CUDA.jl | 85 ++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 example/StillWedgeMDBC_CUDA.jl

diff --git a/README.md b/README.md
index c3f854e6..a776b46d 100644
--- a/README.md
+++ b/README.md
@@ -133,6 +133,8 @@ NeighborLoopCUDA!(InteractionKernel, neighbor_list, acc, position)
 These CUDA utilities are intended as building blocks for integrating GPU
 workflows into the existing solver. You can keep the CPU path intact and
 incrementally replace the neighbor loop where it makes sense for your use case.
+The `example/StillWedgeMDBC_CUDA.jl` script shows how to load the StillWedge
+geometry and run a CUDA neighbor-count pass using these utilities.
 
 ## Help
 
diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl
new file mode 100644
index 00000000..1340769b
--- /dev/null
+++ b/example/StillWedgeMDBC_CUDA.jl
@@ -0,0 +1,85 @@
+using SPHExample
+using CUDA
+using StaticArrays
+
+if !SPHExample.HasCUDA
+    error("CUDA.jl is required to run this example.")
+end
+
+@inline function ComputeBounds(Position)
+    MinCorner = reduce((A, B) -> min.(A, B), Position)
+    MaxCorner = reduce((A, B) -> max.(A, B), Position)
+    return MinCorner, MaxCorner
+end
+
+@inline function BuildGrid(MinCorner, MaxCorner, CellSize)
+    Extents = MaxCorner - MinCorner
+    Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1)
+    return CUDACellGrid(MinCorner, CellSize, Dims)
+end
+
+@inline function SquaredNorm(Vector)
+    Accumulator = zero(eltype(Vector))
+    @inbounds for Index in eachindex(Vector)
+        Accumulator += Vector[Index] * Vector[Index]
+    end
+    return Accumulator
+end
+
+@inline function NeighborCountInteractionKernel(Index, NeighborIndex, Counts, Position, SupportRadiusSquared)
+    Δx = Position[Index] - Position[NeighborIndex]
+    if SquaredNorm(Δx) <= SupportRadiusSquared
+        CUDA.atomic_add!(Counts, Index, 1)
+    end
+    return nothing
+end
+
+function CountNeighborsCUDA(Position, NeighborList, SupportRadiusSquared)
+    Counts = CUDA.zeros(Int, length(Position))
+    NeighborLoopCUDA!(NeighborCountInteractionKernel, NeighborList, Counts, Position, SupportRadiusSquared)
+    return Counts
+end
+
+let
+    Dimensions = 2
+    FloatType  = Float32
+
+    SimConstantsWedge = SimulationConstants{FloatType}(dx=0.02f0, c₀=42.485762f0, δᵩ=0.1f0, CFL=0.5f0)
+
+    FixedBoundary = Geometry{Dimensions, FloatType}(
+        CSVFile     = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Bound.csv",
+        GroupMarker = 1,
+        Type        = Fixed,
+        Motion      = nothing,
+    )
+
+    Water = Geometry{Dimensions, FloatType}(
+        CSVFile     = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Fluid.csv",
+        GroupMarker = 2,
+        Type        = Fluid,
+        Motion      = nothing,
+    )
+
+    SimulationGeometry = [FixedBoundary; Water]
+    SimParticles = AllocateDataStructures(SimulationGeometry)
+
+    SimKernel = SPHKernelInstance{Dimensions, FloatType}(WendlandC2(); dx = SimConstantsWedge.dx)
+
+    PositionGPU = CuArray(SimParticles.Position)
+    MinCorner, MaxCorner = ComputeBounds(SimParticles.Position)
+    Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H)
+    NeighborList = AllocateCUDANeighborList(Grid, length(PositionGPU))
+
+    UpdateNeighborsCUDA!(NeighborList, PositionGPU)
+    NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborList, SimKernel.H²)
+
+    NeighborCountsHost = Array(NeighborCounts)
+    MinimumNeighbors = minimum(NeighborCountsHost)
+    MaximumNeighbors = maximum(NeighborCountsHost)
+    AverageNeighbors = sum(NeighborCountsHost) / length(NeighborCountsHost)
+
+    println("StillWedge CUDA neighbor stats (within H):")
+    println("  Min neighbors: ", MinimumNeighbors)
+    println("  Max neighbors: ", MaximumNeighbors)
+    println("  Avg neighbors: ", AverageNeighbors)
+end

From 8e8811b5a1328a63ac7ffcde706c77d673de73bf Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:13:53 +0100
Subject: [PATCH 3/8] Fix CUDA neighbor kernel coords

---
 src/SPHCUDANeighborList.jl | 48 ++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
index a684ab55..83dbe6b6 100644
--- a/src/SPHCUDANeighborList.jl
+++ b/src/SPHCUDANeighborList.jl
@@ -100,27 +100,41 @@ end
     return CellId
 end
 
-@inline function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T}
-    Remaining = CellId - 1
-    Coords = ntuple(DimIndex -> begin
-        Dim = Grid.Dims[DimIndex]
-        Coord = Remaining % Dim
-        Remaining ÷= Dim
-        Coord
-    end, D)
-    return SVector{D, Int}(Coords)
+@generated function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T}
+    coord_symbols = [Symbol(:Coord_, i) for i in 1:D]
+    coord_defs = map(enumerate(coord_symbols)) do (i, sym)
+        quote
+            Dim = Grid.Dims[$i]
+            $sym = Remaining % Dim
+            Remaining = Remaining ÷ Dim
+        end
+    end
+    return quote
+        Remaining = CellId - 1
+        $(coord_defs...)
+        return SVector{$D, Int}($(coord_symbols...))
+    end
+end
+
+@generated function CellCoordsFromPosition(Position, Grid::CUDACellGrid{D, T}) where {D, T}
+    coord_exprs = map(1:D) do i
+        quote
+            Raw = (Position[$i] - Grid.Origin[$i]) * Grid.InvCellSize
+            Cell = floor(Int, Raw)
+            clamp(Cell, 0, Grid.Dims[$i] - 1)
+        end
+    end
+    return quote
+        return SVector{$D, Int}($(coord_exprs...))
+    end
 end
 
 function ComputeCellIdsKernel!(CellIds, Position, Grid::CUDACellGrid{D, T}) where {D, T}
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
         Pos = Position[Index]
-        Coords = ntuple(DimIndex -> begin
-            Raw = (Pos[DimIndex] - Grid.Origin[DimIndex]) * Grid.InvCellSize
-            Cell = floor(Int, Raw)
-            clamp(Cell, 0, Grid.Dims[DimIndex] - 1)
-        end, D)
-        CellIds[Index] = LinearCellId(SVector{D, Int}(Coords), Grid)
+        Coords = CellCoordsFromPosition(Pos, Grid)
+        CellIds[Index] = LinearCellId(Coords, Grid)
     end
     return nothing
 end
@@ -155,7 +169,7 @@ function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds)
     return nothing
 end
 
-function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid)
+function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid::CUDACellGrid{D, T}) where {D, T}
     CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if CellId <= Grid.CellCount
         CellCoord = CellCoordsFromId(CellId, Grid)
@@ -164,7 +178,7 @@ function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOf
             Offset = NeighborOffsets[OffsetIndex]
             NeighborCoord = CellCoord + Offset
             IsValid = true
-            @inbounds for DimIndex in 1:length(Grid.Dims)
+            @inbounds for DimIndex in 1:D
                 if NeighborCoord[DimIndex] < 0 || NeighborCoord[DimIndex] >= Grid.Dims[DimIndex]
                     IsValid = false
                     break

From b56162c6504ae79d078bc097d29e99a1d31b1810 Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:18:30 +0100
Subject: [PATCH 4/8] Add CUDA per-particle neighbor loop

---
 README.md                  |  2 ++
 src/SPHCUDANeighborList.jl | 70 +++++++++++++++++++++++++++++++++++++-
 src/SPHExample.jl          |  2 +-
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a776b46d..5c1632fe 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,8 @@ The CUDA utilities are only loaded when CUDA.jl is available. The workflow is:
 3. Call `UpdateNeighborsCUDA!` each step after updating positions.
 4. Launch `NeighborLoopCUDA!` with a GPU kernel that computes pairwise
    interactions.
+   For per-particle accumulation (mirroring `NeighborLoopPerParticle!`),
+   use `NeighborLoopPerParticleCUDA!` with init/interact/final callbacks.
 
 Here is a minimal sketch:
 
diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
index 83dbe6b6..9575ce1d 100644
--- a/src/SPHCUDANeighborList.jl
+++ b/src/SPHCUDANeighborList.jl
@@ -2,7 +2,8 @@ module SPHCUDANeighborList
 
 export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
        ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
-       UpdateNeighborsCUDA!, NeighborLoopCUDA!
+       UpdateNeighborsCUDA!, NeighborLoopCUDA!,
+       NeighborLoopPerParticleCUDA!
 
 using CUDA
 using StaticArrays
@@ -276,6 +277,42 @@ function NeighborLoopKernel!(InteractionKernel,
     return nothing
 end
 
+function NeighborLoopPerParticleKernel!(InitKernel,
+                                        InteractionKernel,
+                                        FinalKernel,
+                                        CellIds,
+                                        ParticleOrder,
+                                        ParticleRanges,
+                                        NeighborCells,
+                                        NeighborCounts,
+                                        Args...)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = CellIds[Index]
+        Accumulator = InitKernel(Index, Args...)
+        StartIndex = ParticleRanges[CellId]
+        EndIndex = ParticleRanges[CellId + 1] - 1
+        for j in StartIndex:EndIndex
+            NeighborIndex = ParticleOrder[j]
+            if NeighborIndex != Index
+                Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...)
+            end
+        end
+        NeighborCount = NeighborCounts[CellId]
+        for n in 1:NeighborCount
+            NeighborCell = NeighborCells[n, CellId]
+            StartIndex = ParticleRanges[NeighborCell]
+            EndIndex = ParticleRanges[NeighborCell + 1] - 1
+            for j in StartIndex:EndIndex
+                NeighborIndex = ParticleOrder[j]
+                Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...)
+            end
+        end
+        FinalKernel(Index, Accumulator, Args...)
+    end
+    return nothing
+end
+
 """
     NeighborLoopCUDA!(InteractionKernel, NeighborList, Args...; Threads=256)
 
@@ -298,4 +335,35 @@ function NeighborLoopCUDA!(InteractionKernel,
     return nothing
 end
 
+"""
+    NeighborLoopPerParticleCUDA!(InitKernel, InteractionKernel, FinalKernel, NeighborList, Args...; Threads=256)
+
+Launch a CUDA neighbor loop that accumulates per-particle state. The kernels
+are called as:
+
+- `InitKernel(i, Args...)` to initialize the per-particle accumulator.
+- `InteractionKernel(i, j, Accumulator, Args...)` for each neighbor, returning an updated accumulator.
+- `FinalKernel(i, Accumulator, Args...)` to store the final per-particle state.
+"""
+function NeighborLoopPerParticleCUDA!(InitKernel,
+                                      InteractionKernel,
+                                      FinalKernel,
+                                      NeighborList::CUDANeighborList,
+                                      Args...;
+                                      Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!(
+        InitKernel,
+        InteractionKernel,
+        FinalKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
 end
diff --git a/src/SPHExample.jl b/src/SPHExample.jl
index 0aa158a9..071b1c35 100644
--- a/src/SPHExample.jl
+++ b/src/SPHExample.jl
@@ -68,7 +68,7 @@ module SPHExample
         using .SPHCUDANeighborList
         export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
                ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
-               UpdateNeighborsCUDA!, NeighborLoopCUDA!
+               UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!
     end
 
     using .SPHCellList

From 171ca937254da74ecd65402e76aa25bd636e5ffb Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:21:23 +0100
Subject: [PATCH 5/8] Run StillWedge CPU sim after CUDA setup

---
 example/StillWedgeMDBC_CUDA.jl |  31 ++++++++-
 src/SPHCUDANeighborList.jl     | 112 +++++++++++++++++----------------
 2 files changed, 87 insertions(+), 56 deletions(-)

diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl
index 1340769b..8d472a85 100644
--- a/example/StillWedgeMDBC_CUDA.jl
+++ b/example/StillWedgeMDBC_CUDA.jl
@@ -60,10 +60,28 @@ let
         Motion      = nothing,
     )
 
+    SimMetaDataWedge  = SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, StoreLog}(
+        SimulationName="StillWedge",
+        SaveLocation="W:/Simulations/StillWedge2D_MDBC_CUDA",
+        SimulationTime=4.0f0,
+        OutputTimes=0.01f0,
+        VisualizeInParaview=true,
+        ExportSingleVTKHDF=true,
+        ExportGridCells=true,
+        OpenLogFile=true,
+    )
+
+    if !isdir(SimMetaDataWedge.SaveLocation)
+        mkdir(SimMetaDataWedge.SaveLocation)
+    end
+
     SimulationGeometry = [FixedBoundary; Water]
-    SimParticles = AllocateDataStructures(SimulationGeometry)
+    SimParticles = AllocateDataStructures(SimulationGeometry, SimMetaDataWedge)
 
     SimKernel = SPHKernelInstance{Dimensions, FloatType}(WendlandC2(); dx = SimConstantsWedge.dx)
+    SimLogger = SimulationLogger(SimMetaDataWedge.SaveLocation)
+
+    CleanUpSimulationFolder(SimMetaDataWedge.SaveLocation)
 
     PositionGPU = CuArray(SimParticles.Position)
     MinCorner, MaxCorner = ComputeBounds(SimParticles.Position)
@@ -82,4 +100,15 @@ let
     println("  Min neighbors: ", MinimumNeighbors)
     println("  Max neighbors: ", MaximumNeighbors)
     println("  Avg neighbors: ", AverageNeighbors)
+
+    RunSimulation(
+        SimGeometry         = SimulationGeometry,
+        SimMetaData         = SimMetaDataWedge,
+        SimConstants        = SimConstantsWedge,
+        SimKernel           = SimKernel,
+        SimLogger           = SimLogger,
+        SimParticles        = SimParticles,
+        SimViscosity        = ArtificialViscosity(),
+        SimDensityDiffusion = LinearDensityDiffusion(),
+    )
 end
diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
index 9575ce1d..4cf1acf7 100644
--- a/src/SPHCUDANeighborList.jl
+++ b/src/SPHCUDANeighborList.jl
@@ -5,6 +5,8 @@ export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
        UpdateNeighborsCUDA!, NeighborLoopCUDA!,
        NeighborLoopPerParticleCUDA!
 
+const CUDAIndex = Int32
+
 using CUDA
 using StaticArrays
 
@@ -39,14 +41,14 @@ CUDA storage for per-cell particle ranges and neighbor cell indices.
 """
 struct CUDANeighborList{D, T}
     Grid::CUDACellGrid{D, T}
-    CellIds::CuArray{Int}
-    ParticleOrder::CuArray{Int}
-    ParticleRanges::CuArray{Int}
-    CellCounts::CuArray{Int}
-    CellOffsets::CuArray{Int}
-    NeighborCells::CuArray{Int}
-    NeighborCounts::CuArray{Int}
-    NeighborOffsets::CuArray{SVector{D, Int}}
+    CellIds::CuArray{CUDAIndex}
+    ParticleOrder::CuArray{CUDAIndex}
+    ParticleRanges::CuArray{CUDAIndex}
+    CellCounts::CuArray{CUDAIndex}
+    CellOffsets::CuArray{CUDAIndex}
+    NeighborCells::CuArray{CUDAIndex}
+    NeighborCounts::CuArray{CUDAIndex}
+    NeighborOffsets::CuArray{SVector{D, CUDAIndex}}
 end
 
 """
@@ -55,10 +57,10 @@ end
 Return the non-zero stencil offsets for a D-dimensional Moore neighborhood.
 """
 function ConstructNeighborOffsets(::Val{D}) where {D}
-    Offsets = SVector{D, Int}[]
+    Offsets = SVector{D, CUDAIndex}[]
     for Offset in CartesianIndices(ntuple(_ -> -1:1, D))
         if !all(iszero, Tuple(Offset))
-            push!(Offsets, SVector{D, Int}(Tuple(Offset)))
+            push!(Offsets, SVector{D, CUDAIndex}(Tuple(Offset)))
         end
     end
     return Offsets
@@ -70,14 +72,14 @@ end
 Allocate GPU buffers for neighbor cell lists and particle ordering.
 """
 function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) where {D, T}
-    CellIds = CUDA.zeros(Int, ParticleCount)
-    ParticleOrder = CUDA.zeros(Int, ParticleCount)
-    ParticleRanges = CUDA.zeros(Int, Grid.CellCount + 1)
-    CellCounts = CUDA.zeros(Int, Grid.CellCount)
-    CellOffsets = CUDA.zeros(Int, Grid.CellCount)
+    CellIds = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleRanges = CUDA.zeros(CUDAIndex, Grid.CellCount + 1)
+    CellCounts = CUDA.zeros(CUDAIndex, Grid.CellCount)
+    CellOffsets = CUDA.zeros(CUDAIndex, Grid.CellCount)
     NeighborOffsets = CuArray(ConstructNeighborOffsets(Val(D)))
-    NeighborCells = CUDA.zeros(Int, length(NeighborOffsets), Grid.CellCount)
-    NeighborCounts = CUDA.zeros(Int, Grid.CellCount)
+    NeighborCells = CUDA.zeros(CUDAIndex, length(NeighborOffsets), Grid.CellCount)
+    NeighborCounts = CUDA.zeros(CUDAIndex, Grid.CellCount)
     NeighborList = CUDANeighborList{D, T}(
         Grid,
         CellIds,
@@ -93,27 +95,27 @@ function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int)
     return NeighborList
 end
 
-@inline function LinearCellId(CellCoords::SVector{D, Int}, Grid::CUDACellGrid{D, T}) where {D, T}
-    CellId = 1
+@inline function LinearCellId(CellCoords::SVector{D, CUDAIndex}, Grid::CUDACellGrid{D, T}) where {D, T}
+    CellId = CUDAIndex(1)
     @inbounds for DimIndex in 1:D
-        CellId += CellCoords[DimIndex] * Grid.Strides[DimIndex]
+        CellId += CellCoords[DimIndex] * CUDAIndex(Grid.Strides[DimIndex])
     end
     return CellId
 end
 
-@generated function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T}
+@generated function CellCoordsFromId(CellId::CUDAIndex, Grid::CUDACellGrid{D, T}) where {D, T}
     coord_symbols = [Symbol(:Coord_, i) for i in 1:D]
     coord_defs = map(enumerate(coord_symbols)) do (i, sym)
         quote
-            Dim = Grid.Dims[$i]
+            Dim = CUDAIndex(Grid.Dims[$i])
             $sym = Remaining % Dim
             Remaining = Remaining ÷ Dim
         end
     end
     return quote
-        Remaining = CellId - 1
+        Remaining = CellId - CUDAIndex(1)
         $(coord_defs...)
-        return SVector{$D, Int}($(coord_symbols...))
+        return SVector{$D, CUDAIndex}($(coord_symbols...))
     end
 end
 
@@ -121,12 +123,12 @@ end
     coord_exprs = map(1:D) do i
         quote
             Raw = (Position[$i] - Grid.Origin[$i]) * Grid.InvCellSize
-            Cell = floor(Int, Raw)
-            clamp(Cell, 0, Grid.Dims[$i] - 1)
+            Cell = floor(CUDAIndex, Raw)
+            clamp(Cell, CUDAIndex(0), CUDAIndex(Grid.Dims[$i] - 1))
         end
     end
     return quote
-        return SVector{$D, Int}($(coord_exprs...))
+        return SVector{$D, CUDAIndex}($(coord_exprs...))
     end
 end
 
@@ -144,7 +146,7 @@ function ComputeCellCountsKernel!(CellCounts, CellIds)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
         CellId = CellIds[Index]
-        CUDA.atomic_add!(CellCounts, CellId, 1)
+        CUDA.atomic_add!(CellCounts, CellId, CUDAIndex(1))
     end
     return nothing
 end
@@ -153,9 +155,9 @@ function BuildParticleRangesKernel!(ParticleRanges, CellPrefix)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellPrefix)
         if Index == 1
-            ParticleRanges[1] = 1
+            ParticleRanges[1] = CUDAIndex(1)
         end
-        ParticleRanges[Index + 1] = CellPrefix[Index] + 1
+        ParticleRanges[Index + 1] = CellPrefix[Index] + CUDAIndex(1)
     end
     return nothing
 end
@@ -164,8 +166,8 @@ function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
         CellId = CellIds[Index]
-        TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, 1)
-        ParticleOrder[TargetIndex] = Index
+        TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, CUDAIndex(1))
+        ParticleOrder[Int(TargetIndex)] = CUDAIndex(Index)
     end
     return nothing
 end
@@ -173,21 +175,21 @@ end
 function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid::CUDACellGrid{D, T}) where {D, T}
     CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if CellId <= Grid.CellCount
-        CellCoord = CellCoordsFromId(CellId, Grid)
-        NeighborIndex = 0
+        CellCoord = CellCoordsFromId(CUDAIndex(CellId), Grid)
+        NeighborIndex = CUDAIndex(0)
         for OffsetIndex in 1:length(NeighborOffsets)
             Offset = NeighborOffsets[OffsetIndex]
             NeighborCoord = CellCoord + Offset
             IsValid = true
             @inbounds for DimIndex in 1:D
-                if NeighborCoord[DimIndex] < 0 || NeighborCoord[DimIndex] >= Grid.Dims[DimIndex]
+                if NeighborCoord[DimIndex] < CUDAIndex(0) || NeighborCoord[DimIndex] >= CUDAIndex(Grid.Dims[DimIndex])
                     IsValid = false
                     break
                 end
             end
             if IsValid
-                NeighborIndex += 1
-                NeighborCells[NeighborIndex, CellId] = LinearCellId(NeighborCoord, Grid)
+                NeighborIndex += CUDAIndex(1)
+                NeighborCells[Int(NeighborIndex), CellId] = LinearCellId(NeighborCoord, Grid)
             end
         end
         NeighborCounts[CellId] = NeighborIndex
@@ -254,22 +256,22 @@ function NeighborLoopKernel!(InteractionKernel,
                              Args...)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
-        CellId = CellIds[Index]
-        StartIndex = ParticleRanges[CellId]
-        EndIndex = ParticleRanges[CellId + 1] - 1
+        CellId = Int(CellIds[Index])
+        StartIndex = Int(ParticleRanges[CellId])
+        EndIndex = Int(ParticleRanges[CellId + 1]) - 1
         for j in StartIndex:EndIndex
-            NeighborIndex = ParticleOrder[j]
+            NeighborIndex = Int(ParticleOrder[j])
             if NeighborIndex != Index
                 InteractionKernel(Index, NeighborIndex, Args...)
             end
         end
-        NeighborCount = NeighborCounts[CellId]
+        NeighborCount = Int(NeighborCounts[CellId])
         for n in 1:NeighborCount
-            NeighborCell = NeighborCells[n, CellId]
-            StartIndex = ParticleRanges[NeighborCell]
-            EndIndex = ParticleRanges[NeighborCell + 1] - 1
+            NeighborCell = Int(NeighborCells[n, CellId])
+            StartIndex = Int(ParticleRanges[NeighborCell])
+            EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1
             for j in StartIndex:EndIndex
-                NeighborIndex = ParticleOrder[j]
+                NeighborIndex = Int(ParticleOrder[j])
                 InteractionKernel(Index, NeighborIndex, Args...)
             end
         end
@@ -288,23 +290,23 @@ function NeighborLoopPerParticleKernel!(InitKernel,
                                         Args...)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
-        CellId = CellIds[Index]
+        CellId = Int(CellIds[Index])
         Accumulator = InitKernel(Index, Args...)
-        StartIndex = ParticleRanges[CellId]
-        EndIndex = ParticleRanges[CellId + 1] - 1
+        StartIndex = Int(ParticleRanges[CellId])
+        EndIndex = Int(ParticleRanges[CellId + 1]) - 1
         for j in StartIndex:EndIndex
-            NeighborIndex = ParticleOrder[j]
+            NeighborIndex = Int(ParticleOrder[j])
             if NeighborIndex != Index
                 Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...)
             end
         end
-        NeighborCount = NeighborCounts[CellId]
+        NeighborCount = Int(NeighborCounts[CellId])
         for n in 1:NeighborCount
-            NeighborCell = NeighborCells[n, CellId]
-            StartIndex = ParticleRanges[NeighborCell]
-            EndIndex = ParticleRanges[NeighborCell + 1] - 1
+            NeighborCell = Int(NeighborCells[n, CellId])
+            StartIndex = Int(ParticleRanges[NeighborCell])
+            EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1
             for j in StartIndex:EndIndex
-                NeighborIndex = ParticleOrder[j]
+                NeighborIndex = Int(ParticleOrder[j])
                 Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...)
             end
         end

From ba7d83ccc8c79384e0f414a44ba650d47d895452 Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:33:57 +0100
Subject: [PATCH 6/8] Add CUDA simulation path

---
 README.md                      |   5 +
 example/StillWedgeMDBC_CUDA.jl |   2 +-
 src/SPHCUDASimulation.jl       | 352 +++++++++++++++++++++++++++++++++
 src/SPHExample.jl              |   3 +
 4 files changed, 361 insertions(+), 1 deletion(-)
 create mode 100644 src/SPHCUDASimulation.jl

diff --git a/README.md b/README.md
index 5c1632fe..346097a8 100644
--- a/README.md
+++ b/README.md
@@ -138,6 +138,11 @@ incrementally replace the neighbor loop where it makes sense for your use case.
 The `example/StillWedgeMDBC_CUDA.jl` script shows how to load the StillWedge
 geometry and run a CUDA neighbor-count pass using these utilities.
 
+For an end-to-end GPU path (limited to `NoShifting`, `NoKernelOutput`, `NoMDBC`,
+`ArtificialViscosity`, and `LinearDensityDiffusion`), use `RunSimulationCUDA`
+from the `SPHCUDASimulation` module. This runs the StillWedge-style workflow on
+CUDA arrays and syncs particle data to the CPU only for output/logging.
+
 ## Help
 
 Questions or issues can be posted on the GitHub issue tracker. Response times may vary but all feedback is welcome.
diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl
index 8d472a85..a98ed83a 100644
--- a/example/StillWedgeMDBC_CUDA.jl
+++ b/example/StillWedgeMDBC_CUDA.jl
@@ -101,7 +101,7 @@ let
     println("  Max neighbors: ", MaximumNeighbors)
     println("  Avg neighbors: ", AverageNeighbors)
 
-    RunSimulation(
+    RunSimulationCUDA(
         SimGeometry         = SimulationGeometry,
         SimMetaData         = SimMetaDataWedge,
         SimConstants        = SimConstantsWedge,
diff --git a/src/SPHCUDASimulation.jl b/src/SPHCUDASimulation.jl
new file mode 100644
index 00000000..cb8db28c
--- /dev/null
+++ b/src/SPHCUDASimulation.jl
@@ -0,0 +1,352 @@
+module SPHCUDASimulation
+
+export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA!
+
+using CUDA
+using StaticArrays
+using Parameters
+using LinearAlgebra
+
+using ..SimulationConstantsConfiguration
+using ..SimulationGeometry
+using ..SimulationLoggerConfiguration
+using ..SimulationMetaDataConfiguration
+using ..ProduceHDFVTK
+using ..SPHCUDANeighborList
+using ..SPHDensityDiffusionModels
+using ..SPHKernels
+using ..SPHViscosityModels
+using ..TimeStepping: next_output_time
+using ..SimulationEquations: EquationOfStateGamma7
+using ..OpenExternalPrograms
+
+struct CUDASimParticles{D, T}
+    Position::CuArray{SVector{D, T}}
+    Velocity::CuArray{SVector{D, T}}
+    Acceleration::CuArray{SVector{D, T}}
+    Density::CuArray{T}
+    Pressure::CuArray{T}
+    MotionLimiter::CuArray{T}
+    GravityFactor::CuArray{T}
+end
+
+@inline function AllocateCUDASimParticles(SimParticles)
+    return CUDASimParticles(
+        CuArray(SimParticles.Position),
+        CuArray(SimParticles.Velocity),
+        CuArray(SimParticles.Acceleration),
+        CuArray(SimParticles.Density),
+        CuArray(SimParticles.Pressure),
+        CuArray(SimParticles.MotionLimiter),
+        CuArray(SimParticles.GravityFactor),
+    )
+end
+
+@inline function SyncParticlesFromCUDA!(SimParticles, CUDAParticles::CUDASimParticles)
+    copyto!(SimParticles.Position, Array(CUDAParticles.Position))
+    copyto!(SimParticles.Velocity, Array(CUDAParticles.Velocity))
+    copyto!(SimParticles.Acceleration, Array(CUDAParticles.Acceleration))
+    copyto!(SimParticles.Density, Array(CUDAParticles.Density))
+    copyto!(SimParticles.Pressure, Array(CUDAParticles.Pressure))
+    return nothing
+end
+
+@inline function ComputeBounds(Position)
+    MinCorner = reduce((A, B) -> min.(A, B), Position)
+    MaxCorner = reduce((A, B) -> max.(A, B), Position)
+    return MinCorner, MaxCorner
+end
+
+@inline function BuildGrid(MinCorner, MaxCorner, CellSize)
+    Extents = MaxCorner - MinCorner
+    Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1)
+    return CUDACellGrid(MinCorner, CellSize, Dims)
+end
+
+@generated function GravityVector(::Val{D}, Value::T) where {D, T}
+    Entries = [:(Index == $D ? Value : zero(T)) for Index in 1:D]
+    return quote
+        return SVector{$D, T}($(Entries...))
+    end
+end
+
+function PressureKernel!(Pressure, Density, SimConstants)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Pressure)
+        ρ = Density[Index]
+        Pressure[Index] = EquationOfStateGamma7(ρ, SimConstants.c₀, SimConstants.ρ₀)
+    end
+    return nothing
+end
+
+function LimitDensityKernel!(Density, ρ₀, MotionLimiter)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Density)
+        if Density[Index] < ρ₀ && MotionLimiter[Index] == zero(eltype(MotionLimiter))
+            Density[Index] = ρ₀
+        end
+    end
+    return nothing
+end
+
+function DensityEpsiKernel!(Density, DensityRate, DensityHalf, Δt)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Density)
+        Epsi = -(DensityRate[Index] / DensityHalf[Index]) * Δt
+        Density[Index] *= (2 - Epsi) / (2 + Epsi)
+    end
+    return nothing
+end
+
+function HalfTimeStepKernel!(Position, Velocity, Acceleration, Density,
+                             GravityFactor, MotionLimiter,
+                             PositionHalf, VelocityHalf, DensityHalf, DensityRate,
+                             DtHalf, GravityValue, Dimensions)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Position)
+        Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index])
+        Acceleration[Index] = Acc
+        Factor = MotionLimiter[Index]
+        PositionHalf[Index] = Position[Index] + Velocity[Index] * DtHalf * Factor
+        VelocityHalf[Index] = Velocity[Index] + Acc * DtHalf * Factor
+        DensityHalf[Index] = Density[Index] + DensityRate[Index] * DtHalf
+    end
+    return nothing
+end
+
+function FullTimeStepKernel!(Position, Velocity, Acceleration, GravityFactor, MotionLimiter, Dt, GravityValue, Dimensions)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Position)
+        Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index])
+        Acceleration[Index] = Acc
+        Factor = MotionLimiter[Index]
+        Velocity[Index] += Acc * Dt * Factor
+        Position[Index] += (((Velocity[Index] + (Velocity[Index] - Acc * Dt * Factor)) / 2) * Dt) * Factor
+    end
+    return nothing
+end
+
+function TimeStepBuffersKernel!(MaxVisc, MinDtForce, Position, Velocity, Acceleration, SimKernel)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Position)
+        R = Position[Index]
+        V = Velocity[Index]
+        A = Acceleration[Index]
+        RSq = sqrt(dot(R, R))^2
+        CurrVisc = abs(SimKernel.h * dot(V, R) / (RSq + SimKernel.η²))
+        AMag = norm(A)
+        CurrDtForce = AMag > 0 ? sqrt(SimKernel.h / AMag) : typemax(eltype(MinDtForce))
+        MaxVisc[Index] = CurrVisc
+        MinDtForce[Index] = CurrDtForce
+    end
+    return nothing
+end
+
+@inline function InitAccumulatorKernel(Index, Density, Position)
+    return (zero(eltype(Density)), zero(eltype(Position)))
+end
+
+@inline function InteractionKernel(Index, NeighborIndex, Accumulator,
+                                   Position, Density, Pressure, Velocity,
+                                   MotionLimiter, SimKernel, SimConstants)
+    DensityRate, AccelerationValue = Accumulator
+    Δx = Position[Index] - Position[NeighborIndex]
+    Δx² = dot(Δx, Δx)
+    if Δx² <= SimKernel.H²
+        Distance = sqrt(abs(Δx²))
+        Q = clamp(Distance * SimKernel.h⁻¹, zero(Distance), 2)
+        ∇W = ∇Wᵢⱼ(SimKernel, Q, Δx)
+
+        ρᵢ = Density[Index]
+        ρⱼ = Density[NeighborIndex]
+
+        Vᵢ = Velocity[Index]
+        Vⱼ = Velocity[NeighborIndex]
+        Vᵢⱼ = Vᵢ - Vⱼ
+        DensityTerm = dot(-Vᵢⱼ, ∇W)
+        DensityRateContribution = -ρᵢ * (SimConstants.m₀ / ρⱼ) * DensityTerm
+
+        LinearρFactor = inv(SimConstants.Cb * SimConstants.γ) * SimConstants.ρ₀
+        Pᵢⱼᴴ = SimConstants.ρ₀ * (-SimConstants.g) * -Δx[end]
+        ρᵢⱼᴴ = Pᵢⱼᴴ * LinearρFactor
+        InvD = one(eltype(ρᵢ)) / (Δx² + SimKernel.η²)
+        ρⱼᵢ = ρⱼ - ρᵢ
+        ψᵢⱼ = 2 * (ρⱼᵢ - ρᵢⱼᴴ) * (-Δx) * InvD
+        MotionFactor = MotionLimiter[Index] * MotionLimiter[NeighborIndex]
+        DiffusionContribution = SimConstants.δᵩ * SimKernel.h * SimConstants.c₀ * (SimConstants.m₀ / ρⱼ) * dot(ψᵢⱼ, ∇W) * MotionFactor
+
+        DensityRate += DensityRateContribution + DiffusionContribution
+
+        Pᵢ = Pressure[Index]
+        Pⱼ = Pressure[NeighborIndex]
+        Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ)
+        Fab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, Q, SimConstants.dx)
+        AccelerationContribution = -SimConstants.m₀ * (Pfac + Fab) * ∇W
+
+        ViscTerm = zero(AccelerationContribution)
+        VDotX = dot(Vᵢⱼ, Δx)
+        if VDotX < 0
+            ρ̄ = 0.5 * (ρᵢ + ρⱼ)
+            μᵢⱼ = SimKernel.h * VDotX / (Δx² + SimKernel.η²)
+            ViscTerm = -SimConstants.m₀ * (-SimConstants.α * SimConstants.c₀ * μᵢⱼ) / ρ̄ * ∇W
+        end
+
+        AccelerationValue += AccelerationContribution + ViscTerm
+    end
+
+    return DensityRate, AccelerationValue
+end
+
+@inline function FinalKernel(Index, Accumulator, DensityRate, Acceleration)
+    DensityRateValue, AccelerationValue = Accumulator
+    DensityRate[Index] = DensityRateValue
+    Acceleration[Index] = AccelerationValue
+    return nothing
+end
+
+function ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
+    MaxViscHost = maximum(Array(MaxVisc))
+    MinDtHost = minimum(Array(MinDtForce))
+    DtLimit = SimKernel.h / (SimConstants.c₀ + MaxViscHost)
+    return SimConstants.CFL * min(MinDtHost, DtLimit)
+end
+
+function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}},
+        SimMetaData::SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, LMode},
+        SimConstants::SimulationConstants,
+        SimKernel::SPHKernelInstance,
+        SimLogger::SimulationLogger,
+        SimParticles,
+        SimViscosity::ArtificialViscosity,
+        SimDensityDiffusion::LinearDensityDiffusion,
+        ) where {Dimensions, FloatType, LMode<:LogMode}
+
+    NumberOfPoints = length(SimParticles)
+    CUDAParticles = AllocateCUDASimParticles(SimParticles)
+
+    DensityRate = CUDA.zeros(FloatType, NumberOfPoints)
+    VelocityHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints)
+    PositionHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints)
+    DensityHalf = CUDA.zeros(FloatType, NumberOfPoints)
+    MaxVisc = CUDA.zeros(FloatType, NumberOfPoints)
+    MinDtForce = CUDA.zeros(FloatType, NumberOfPoints)
+
+    MinCorner, MaxCorner = ComputeBounds(SimParticles.Position)
+    Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H)
+    NeighborList = AllocateCUDANeighborList(Grid, NumberOfPoints)
+
+    output = SetupVTKOutput(SimMetaData, SimParticles, SimKernel, Dimensions)
+
+    SimMetaData.OutputIterationCounter = 1
+    output.enqueue_particles(SimMetaData.OutputIterationCounter)
+
+    InitializeLog!(SimMetaData, SimLogger, SimConstants, SimKernel, SimViscosity, SimDensityDiffusion, SimGeometry, SimParticles)
+
+    Threads = 256
+    Blocks = cld(NumberOfPoints, Threads)
+    DimensionsVal = Val(Dimensions)
+
+    CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants)
+    CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!(
+        MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel,
+    )
+    Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
+
+    while SimMetaData.TotalTime <= next_output_time(SimMetaData)
+        UpdateNeighborsCUDA!(NeighborList, CUDAParticles.Position)
+
+        CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants)
+
+        NeighborLoopPerParticleCUDA!(
+            InitAccumulatorKernel,
+            InteractionKernel,
+            FinalKernel,
+            NeighborList,
+            CUDAParticles.Position,
+            CUDAParticles.Density,
+            CUDAParticles.Pressure,
+            CUDAParticles.Velocity,
+            CUDAParticles.MotionLimiter,
+            SimKernel,
+            SimConstants,
+            DensityRate,
+            CUDAParticles.Acceleration,
+        )
+
+        DtHalf = Dt * 0.5
+        CUDA.@cuda threads=Threads blocks=Blocks HalfTimeStepKernel!(
+            CUDAParticles.Position,
+            CUDAParticles.Velocity,
+            CUDAParticles.Acceleration,
+            CUDAParticles.Density,
+            CUDAParticles.GravityFactor,
+            CUDAParticles.MotionLimiter,
+            PositionHalf,
+            VelocityHalf,
+            DensityHalf,
+            DensityRate,
+            DtHalf,
+            SimConstants.g,
+            DimensionsVal,
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(DensityHalf, SimConstants.ρ₀, CUDAParticles.MotionLimiter)
+
+        CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, DensityHalf, SimConstants)
+
+        NeighborLoopPerParticleCUDA!(
+            InitAccumulatorKernel,
+            InteractionKernel,
+            FinalKernel,
+            NeighborList,
+            PositionHalf,
+            DensityHalf,
+            CUDAParticles.Pressure,
+            VelocityHalf,
+            CUDAParticles.MotionLimiter,
+            SimKernel,
+            SimConstants,
+            DensityRate,
+            CUDAParticles.Acceleration,
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter)
+        CUDA.@cuda threads=Threads blocks=Blocks DensityEpsiKernel!(CUDAParticles.Density, DensityRate, DensityHalf, Dt)
+
+        CUDA.@cuda threads=Threads blocks=Blocks FullTimeStepKernel!(
+            CUDAParticles.Position,
+            CUDAParticles.Velocity,
+            CUDAParticles.Acceleration,
+            CUDAParticles.GravityFactor,
+            CUDAParticles.MotionLimiter,
+            Dt,
+            SimConstants.g,
+            DimensionsVal,
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!(
+            MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel,
+        )
+        Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
+
+        SimMetaData.Iteration += 1
+        SimMetaData.CurrentTimeStep = Dt
+        SimMetaData.TotalTime += Dt
+
+        SyncParticlesFromCUDA!(SimParticles, CUDAParticles)
+        LogStep!(SimMetaData, SimLogger)
+        SimMetaData.OutputIterationCounter += 1
+        output.enqueue_particles(SimMetaData.OutputIterationCounter)
+
+        if SimMetaData.TotalTime > SimMetaData.SimulationTime
+            output.close_files()
+            FinalizeLog!(SimMetaData, SimLogger)
+            AutoOpenLogFile(SimLogger, SimMetaData)
+            break
+        end
+    end
+
+    return nothing
+end
+
+end
diff --git a/src/SPHExample.jl b/src/SPHExample.jl
index 071b1c35..113aa6b8 100644
--- a/src/SPHExample.jl
+++ b/src/SPHExample.jl
@@ -17,6 +17,7 @@ module SPHExample
     const HasCUDA = Base.find_package("CUDA") !== nothing
     if HasCUDA
         include("SPHCUDANeighborList.jl")
+        include("SPHCUDASimulation.jl")
     end
     include("SPHCellList.jl") #Must be last    
 
@@ -69,6 +70,8 @@ module SPHExample
         export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
                ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
                UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!
+        using .SPHCUDASimulation
+        export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA!
     end
 
     using .SPHCellList

From c8f9ef5c3a96bed9dde4a699c8a33e0b56d5714f Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:39:07 +0100
Subject: [PATCH 7/8] Fix CUDA atomic index types

---
 src/SPHCUDANeighborList.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
index 4cf1acf7..19db7d63 100644
--- a/src/SPHCUDANeighborList.jl
+++ b/src/SPHCUDANeighborList.jl
@@ -146,7 +146,7 @@ function ComputeCellCountsKernel!(CellCounts, CellIds)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
         CellId = CellIds[Index]
-        CUDA.atomic_add!(CellCounts, CellId, CUDAIndex(1))
+        CUDA.atomic_add!(CellCounts, Int(CellId), CUDAIndex(1))
     end
     return nothing
 end
@@ -166,7 +166,7 @@ function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds)
     Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
     if Index <= length(CellIds)
         CellId = CellIds[Index]
-        TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, CUDAIndex(1))
+        TargetIndex = CUDA.atomic_add!(CellOffsets, Int(CellId), CUDAIndex(1))
         ParticleOrder[Int(TargetIndex)] = CUDAIndex(Index)
     end
     return nothing

From be8d34dbb7125938c76954d80639daf49561a7dc Mon Sep 17 00:00:00 2001
From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com>
Date: Thu, 15 Jan 2026 00:48:28 +0100
Subject: [PATCH 8/8] Use CPU neighbor list for CUDA loops

---
 README.md                      |   4 +-
 example/StillWedgeMDBC_CUDA.jl |  29 ++++++++--
 src/SPHCUDANeighborList.jl     | 101 ++++++++++++++++++++++++++++++++-
 src/SPHCUDASimulation.jl       |  37 ++++++++++--
 src/SPHExample.jl              |   3 +-
 5 files changed, 160 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 346097a8..886fe15b 100644
--- a/README.md
+++ b/README.md
@@ -141,7 +141,9 @@ geometry and run a CUDA neighbor-count pass using these utilities.
 For an end-to-end GPU path (limited to `NoShifting`, `NoKernelOutput`, `NoMDBC`,
 `ArtificialViscosity`, and `LinearDensityDiffusion`), use `RunSimulationCUDA`
 from the `SPHCUDASimulation` module. This runs the StillWedge-style workflow on
-CUDA arrays and syncs particle data to the CPU only for output/logging.
+CUDA arrays and builds neighbor lists on the CPU, then launches GPU neighbor
+loops (particle data is synced to the CPU only for neighbor-list construction
+and output/logging).
 
 ## Help
 
diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl
index a98ed83a..e7ac0cc6 100644
--- a/example/StillWedgeMDBC_CUDA.jl
+++ b/example/StillWedgeMDBC_CUDA.jl
@@ -84,12 +84,31 @@ let
     CleanUpSimulationFolder(SimMetaDataWedge.SaveLocation)
 
     PositionGPU = CuArray(SimParticles.Position)
-    MinCorner, MaxCorner = ComputeBounds(SimParticles.Position)
-    Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H)
-    NeighborList = AllocateCUDANeighborList(Grid, length(PositionGPU))
 
-    UpdateNeighborsCUDA!(NeighborList, PositionGPU)
-    NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborList, SimKernel.H²)
+    ParticleRanges = zeros(Int, length(SimParticles) + 2)
+    UniqueCells = zeros(CartesianIndex{Dimensions}, length(SimParticles))
+    CellDict = Dict{CartesianIndex{Dimensions}, Int}()
+    FullStencil = ConstructStencil(Val(Dimensions))
+    NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)]
+    ParticleOrder = zeros(Int, length(SimParticles))
+    CellOffsets = zeros(Int, length(ParticleRanges))
+    CellIdsHost = zeros(Int, length(SimParticles))
+
+    IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets)
+    UniqueCellsView = view(UniqueCells, 1:IndexCounter)
+    BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict)
+    @inbounds for Index in eachindex(CellIdsHost, SimParticles.Cells)
+        CellIdsHost[Index] = CellDict[SimParticles.Cells[Index]]
+    end
+    NeighborListPacked = UpdateNeighborsCPUToCUDA!(
+        nothing,
+        CellIdsHost,
+        ParticleOrder,
+        view(ParticleRanges, 1:(IndexCounter + 1)),
+        view(NeighborCellLists, 1:IndexCounter),
+    )
+
+    NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborListPacked, SimKernel.H²)
 
     NeighborCountsHost = Array(NeighborCounts)
     MinimumNeighbors = minimum(NeighborCountsHost)
diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
index 19db7d63..4abc0226 100644
--- a/src/SPHCUDANeighborList.jl
+++ b/src/SPHCUDANeighborList.jl
@@ -3,7 +3,8 @@ module SPHCUDANeighborList
 export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
        ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
        UpdateNeighborsCUDA!, NeighborLoopCUDA!,
-       NeighborLoopPerParticleCUDA!
+       NeighborLoopPerParticleCUDA!, CUDAPackedNeighborList,
+       AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA!
 
 const CUDAIndex = Int32
 
@@ -51,6 +52,16 @@ struct CUDANeighborList{D, T}
     NeighborOffsets::CuArray{SVector{D, CUDAIndex}}
 end
 
+struct CUDAPackedNeighborList
+    CellIds::CuArray{CUDAIndex}
+    ParticleOrder::CuArray{CUDAIndex}
+    ParticleRanges::CuArray{CUDAIndex}
+    NeighborCells::CuArray{CUDAIndex}
+    NeighborCounts::CuArray{CUDAIndex}
+    CellCount::Int
+    MaxNeighbors::Int
+end
+
 """
     ConstructNeighborOffsets(Val(D))
 
@@ -95,6 +106,56 @@ function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int)
     return NeighborList
 end
 
+function AllocateCUDAPackedNeighborList(ParticleCount::Int, CellCount::Int, MaxNeighbors::Int)
+    CellIds = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleRanges = CUDA.zeros(CUDAIndex, CellCount + 1)
+    NeighborCells = CUDA.zeros(CUDAIndex, MaxNeighbors, CellCount)
+    NeighborCounts = CUDA.zeros(CUDAIndex, CellCount)
+    return CUDAPackedNeighborList(
+        CellIds,
+        ParticleOrder,
+        ParticleRanges,
+        NeighborCells,
+        NeighborCounts,
+        CellCount,
+        MaxNeighbors,
+    )
+end
+
+function UpdateNeighborsCPUToCUDA!(NeighborList::Union{Nothing, CUDAPackedNeighborList},
+                                   CellIdsHost,
+                                   ParticleOrderHost,
+                                   ParticleRangesHost,
+                                   NeighborCellListsHost)
+    CellCount = length(NeighborCellListsHost)
+    MaxNeighbors = maximum(length, NeighborCellListsHost; init=0)
+    if NeighborList === nothing || NeighborList.CellCount != CellCount || NeighborList.MaxNeighbors < MaxNeighbors
+        NeighborList = AllocateCUDAPackedNeighborList(
+            length(CellIdsHost),
+            CellCount,
+            max(MaxNeighbors, 1),
+        )
+    end
+
+    NeighborCountsHost = zeros(CUDAIndex, CellCount)
+    NeighborCellsHost = zeros(CUDAIndex, NeighborList.MaxNeighbors, CellCount)
+    @inbounds for CellIndex in 1:CellCount
+        Neighbors = NeighborCellListsHost[CellIndex]
+        NeighborCountsHost[CellIndex] = CUDAIndex(length(Neighbors))
+        for (NeighborOffset, NeighborIndex) in enumerate(Neighbors)
+            NeighborCellsHost[NeighborOffset, CellIndex] = CUDAIndex(NeighborIndex)
+        end
+    end
+
+    copyto!(NeighborList.CellIds, CUDAIndex.(CellIdsHost))
+    copyto!(NeighborList.ParticleOrder, CUDAIndex.(ParticleOrderHost))
+    copyto!(NeighborList.ParticleRanges, CUDAIndex.(ParticleRangesHost))
+    copyto!(NeighborList.NeighborCells, NeighborCellsHost)
+    copyto!(NeighborList.NeighborCounts, NeighborCountsHost)
+    return NeighborList
+end
+
 @inline function LinearCellId(CellCoords::SVector{D, CUDAIndex}, Grid::CUDACellGrid{D, T}) where {D, T}
     CellId = CUDAIndex(1)
     @inbounds for DimIndex in 1:D
@@ -337,6 +398,23 @@ function NeighborLoopCUDA!(InteractionKernel,
     return nothing
 end
 
+function NeighborLoopCUDA!(InteractionKernel,
+                           NeighborList::CUDAPackedNeighborList,
+                           Args...;
+                           Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!(
+        InteractionKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
 """
     NeighborLoopPerParticleCUDA!(InitKernel, InteractionKernel, FinalKernel, NeighborList, Args...; Threads=256)
 
@@ -368,4 +446,25 @@ function NeighborLoopPerParticleCUDA!(InitKernel,
     return nothing
 end
 
+function NeighborLoopPerParticleCUDA!(InitKernel,
+                                      InteractionKernel,
+                                      FinalKernel,
+                                      NeighborList::CUDAPackedNeighborList,
+                                      Args...;
+                                      Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!(
+        InitKernel,
+        InteractionKernel,
+        FinalKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
 end
diff --git a/src/SPHCUDASimulation.jl b/src/SPHCUDASimulation.jl
index cb8db28c..1736e373 100644
--- a/src/SPHCUDASimulation.jl
+++ b/src/SPHCUDASimulation.jl
@@ -19,6 +19,7 @@ using ..SPHViscosityModels
 using ..TimeStepping: next_output_time
 using ..SimulationEquations: EquationOfStateGamma7
 using ..OpenExternalPrograms
+using ..SPHNeighborList: BuildNeighborCellLists!, ConstructStencil, UpdateNeighbors!
 
 struct CUDASimParticles{D, T}
     Position::CuArray{SVector{D, T}}
@@ -63,6 +64,13 @@ end
     return CUDACellGrid(MinCorner, CellSize, Dims)
 end
 
+@inline function BuildCellIds!(CellIds, Cells, CellDict)
+    @inbounds for Index in eachindex(CellIds, Cells)
+        CellIds[Index] = CellDict[Cells[Index]]
+    end
+    return nothing
+end
+
 @generated function GravityVector(::Val{D}, Value::T) where {D, T}
     Entries = [:(Index == $D ? Value : zero(T)) for Index in 1:D]
     return quote
@@ -231,9 +239,15 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}}
     MaxVisc = CUDA.zeros(FloatType, NumberOfPoints)
     MinDtForce = CUDA.zeros(FloatType, NumberOfPoints)
 
-    MinCorner, MaxCorner = ComputeBounds(SimParticles.Position)
-    Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H)
-    NeighborList = AllocateCUDANeighborList(Grid, NumberOfPoints)
+    ParticleRanges = zeros(Int, NumberOfPoints + 1 + 1)
+    UniqueCells = zeros(CartesianIndex{Dimensions}, NumberOfPoints)
+    CellDict = Dict{CartesianIndex{Dimensions}, Int}()
+    FullStencil = ConstructStencil(Val(Dimensions))
+    NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)]
+    ParticleOrder = zeros(Int, NumberOfPoints)
+    CellOffsets = zeros(Int, length(ParticleRanges))
+    CellIdsHost = zeros(Int, NumberOfPoints)
+    PackedNeighborList = nothing
 
     output = SetupVTKOutput(SimMetaData, SimParticles, SimKernel, Dimensions)
 
@@ -253,7 +267,18 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}}
     Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
 
     while SimMetaData.TotalTime <= next_output_time(SimMetaData)
-        UpdateNeighborsCUDA!(NeighborList, CUDAParticles.Position)
+        copyto!(SimParticles.Position, Array(CUDAParticles.Position))
+        IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets)
+        UniqueCellsView = view(UniqueCells, 1:IndexCounter)
+        BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict)
+        BuildCellIds!(CellIdsHost, SimParticles.Cells, CellDict)
+        PackedNeighborList = UpdateNeighborsCPUToCUDA!(
+            PackedNeighborList,
+            CellIdsHost,
+            ParticleOrder,
+            view(ParticleRanges, 1:(IndexCounter + 1)),
+            view(NeighborCellLists, 1:IndexCounter),
+        )
 
         CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants)
 
@@ -261,7 +286,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}}
             InitAccumulatorKernel,
             InteractionKernel,
             FinalKernel,
-            NeighborList,
+            PackedNeighborList,
             CUDAParticles.Position,
             CUDAParticles.Density,
             CUDAParticles.Pressure,
@@ -298,7 +323,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}}
             InitAccumulatorKernel,
             InteractionKernel,
             FinalKernel,
-            NeighborList,
+            PackedNeighborList,
             PositionHalf,
             DensityHalf,
             CUDAParticles.Pressure,
diff --git a/src/SPHExample.jl b/src/SPHExample.jl
index 113aa6b8..ae313057 100644
--- a/src/SPHExample.jl
+++ b/src/SPHExample.jl
@@ -69,7 +69,8 @@ module SPHExample
         using .SPHCUDANeighborList
         export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
                ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
-               UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!
+               UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!,
+               CUDAPackedNeighborList, AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA!
         using .SPHCUDASimulation
         export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA!
     end