diff --git a/README.md b/README.md
index c23bc265..886fe15b 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,7 @@ The project demonstrates how to assemble a small SPH solver with Julia. It focus
 - **Dynamic boundary condition** – inspired by DualSPHysics.
 - **Density diffusion** – based on Fourtakas et al. 2019 to reduce pressure noise.
 - **Wendland quintic kernel** – simple and stable without tensile corrections.
+- **CUDA neighbor lists** – optional GPU neighbor cell list and loop utilities for CUDA arrays.
 
 ## Folder Structure
 
@@ -91,6 +92,59 @@ one plus the particles in its neighbor stencil.
 Neighbor rebuilds use a per-cell ordering buffer without reordering particle
 storage.
 
+### CUDA Neighbor Utilities
+
+This package ships with an optional CUDA-focused neighbor list to offload the
+cell list construction and neighbor iteration. Install CUDA.jl in the project
+environment before using these helpers:
+
+```julia
+using Pkg
+Pkg.add("CUDA")
+```
+
+The CUDA utilities are only loaded when CUDA.jl is available. The workflow is:
+
+1. Build a `CUDACellGrid` describing the simulation bounds and desired cell size.
+2. Allocate a `CUDANeighborList` once for the particle count.
+3. Call `UpdateNeighborsCUDA!` each step after updating positions.
+4. Launch `NeighborLoopCUDA!` with a GPU kernel that computes pairwise
+   interactions.
+   For per-particle accumulation (mirroring `NeighborLoopPerParticle!`),
+   use `NeighborLoopPerParticleCUDA!` with init/interact/final callbacks.
+
+Here is a minimal sketch:
+
+```julia
+using CUDA
+using SPHExample
+
+grid = CUDACellGrid(SVector(0.0f0, 0.0f0), 0.01f0, SVector(128, 128))
+neighbor_list = AllocateCUDANeighborList(grid, length(position))
+
+UpdateNeighborsCUDA!(neighbor_list, position)
+
+function InteractionKernel(i, j, acc, pos)
+    @inbounds acc[i] += pos[j] - pos[i]
+    return nothing
+end
+
+NeighborLoopCUDA!(InteractionKernel, neighbor_list, acc, position)
+```
+
+These CUDA utilities are intended as building blocks for integrating GPU
+workflows into the existing solver. You can keep the CPU path intact and
+incrementally replace the neighbor loop where it makes sense for your use case.
+The `example/StillWedgeMDBC_CUDA.jl` script shows how to load the StillWedge
+geometry and run a CUDA neighbor-count pass using these utilities.
+
+For an end-to-end GPU path (limited to `NoShifting`, `NoKernelOutput`, `NoMDBC`,
+`ArtificialViscosity`, and `LinearDensityDiffusion`), use `RunSimulationCUDA`
+from the `SPHCUDASimulation` module. This runs the StillWedge-style workflow on
+CUDA arrays and builds neighbor lists on the CPU, then launches GPU neighbor
+loops (particle data is synced to the CPU only for neighbor-list construction
+and output/logging).
+
 ## Help
 
 Questions or issues can be posted on the GitHub issue tracker. Response times may vary but all feedback is welcome.
diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl
new file mode 100644
index 00000000..e7ac0cc6
--- /dev/null
+++ b/example/StillWedgeMDBC_CUDA.jl
@@ -0,0 +1,133 @@
+using SPHExample
+using CUDA
+using StaticArrays
+
+if !SPHExample.HasCUDA
+    error("CUDA.jl is required to run this example.")
+end
+
+@inline function ComputeBounds(Position)
+    MinCorner = reduce((A, B) -> min.(A, B), Position)
+    MaxCorner = reduce((A, B) -> max.(A, B), Position)
+    return MinCorner, MaxCorner
+end
+
+@inline function BuildGrid(MinCorner, MaxCorner, CellSize)
+    Extents = MaxCorner - MinCorner
+    Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1)
+    return CUDACellGrid(MinCorner, CellSize, Dims)
+end
+
+@inline function SquaredNorm(Vector)
+    Accumulator = zero(eltype(Vector))
+    @inbounds for Index in eachindex(Vector)
+        Accumulator += Vector[Index] * Vector[Index]
+    end
+    return Accumulator
+end
+
+@inline function NeighborCountInteractionKernel(Index, NeighborIndex, Counts, Position, SupportRadiusSquared)
+    Δx = Position[Index] - Position[NeighborIndex]
+    if SquaredNorm(Δx) <= SupportRadiusSquared
+        CUDA.atomic_add!(Counts, Index, 1)
+    end
+    return nothing
+end
+
+function CountNeighborsCUDA(Position, NeighborList, SupportRadiusSquared)
+    Counts = CUDA.zeros(Int, length(Position))
+    NeighborLoopCUDA!(NeighborCountInteractionKernel, NeighborList, Counts, Position, SupportRadiusSquared)
+    return Counts
+end
+
+let
+    Dimensions = 2
+    FloatType  = Float32
+
+    SimConstantsWedge = SimulationConstants{FloatType}(dx=0.02f0, c₀=42.485762f0, δᵩ=0.1f0, CFL=0.5f0)
+
+    FixedBoundary = Geometry{Dimensions, FloatType}(
+        CSVFile     = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Bound.csv",
+        GroupMarker = 1,
+        Type        = Fixed,
+        Motion      = nothing,
+    )
+
+    Water = Geometry{Dimensions, FloatType}(
+        CSVFile     = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Fluid.csv",
+        GroupMarker = 2,
+        Type        = Fluid,
+        Motion      = nothing,
+    )
+
+    SimMetaDataWedge  = SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, StoreLog}(
+        SimulationName="StillWedge",
+        SaveLocation="W:/Simulations/StillWedge2D_MDBC_CUDA",
+        SimulationTime=4.0f0,
+        OutputTimes=0.01f0,
+        VisualizeInParaview=true,
+        ExportSingleVTKHDF=true,
+        ExportGridCells=true,
+        OpenLogFile=true,
+    )
+
+    if !isdir(SimMetaDataWedge.SaveLocation)
+        mkdir(SimMetaDataWedge.SaveLocation)
+    end
+
+    SimulationGeometry = [FixedBoundary; Water]
+    SimParticles = AllocateDataStructures(SimulationGeometry, SimMetaDataWedge)
+
+    SimKernel = SPHKernelInstance{Dimensions, FloatType}(WendlandC2(); dx = SimConstantsWedge.dx)
+    SimLogger = SimulationLogger(SimMetaDataWedge.SaveLocation)
+
+    CleanUpSimulationFolder(SimMetaDataWedge.SaveLocation)
+
+    PositionGPU = CuArray(SimParticles.Position)
+
+    ParticleRanges = zeros(Int, length(SimParticles) + 2)
+    UniqueCells = zeros(CartesianIndex{Dimensions}, length(SimParticles))
+    CellDict = Dict{CartesianIndex{Dimensions}, Int}()
+    FullStencil = ConstructStencil(Val(Dimensions))
+    NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)]
+    ParticleOrder = zeros(Int, length(SimParticles))
+    CellOffsets = zeros(Int, length(ParticleRanges))
+    CellIdsHost = zeros(Int, length(SimParticles))
+
+    IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets)
+    UniqueCellsView = view(UniqueCells, 1:IndexCounter)
+    BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict)
+    @inbounds for Index in eachindex(CellIdsHost, SimParticles.Cells)
+        CellIdsHost[Index] = CellDict[SimParticles.Cells[Index]]
+    end
+    NeighborListPacked = UpdateNeighborsCPUToCUDA!(
+        nothing,
+        CellIdsHost,
+        ParticleOrder,
+        view(ParticleRanges, 1:(IndexCounter + 1)),
+        view(NeighborCellLists, 1:IndexCounter),
+    )
+
+    NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborListPacked, SimKernel.H²)
+
+    NeighborCountsHost = Array(NeighborCounts)
+    MinimumNeighbors = minimum(NeighborCountsHost)
+    MaximumNeighbors = maximum(NeighborCountsHost)
+    AverageNeighbors = sum(NeighborCountsHost) / length(NeighborCountsHost)
+
+    println("StillWedge CUDA neighbor stats (within H):")
+    println("  Min neighbors: ", MinimumNeighbors)
+    println("  Max neighbors: ", MaximumNeighbors)
+    println("  Avg neighbors: ", AverageNeighbors)
+
+    RunSimulationCUDA(
+        SimGeometry         = SimulationGeometry,
+        SimMetaData         = SimMetaDataWedge,
+        SimConstants        = SimConstantsWedge,
+        SimKernel           = SimKernel,
+        SimLogger           = SimLogger,
+        SimParticles        = SimParticles,
+        SimViscosity        = ArtificialViscosity(),
+        SimDensityDiffusion = LinearDensityDiffusion(),
+    )
+end
diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl
new file mode 100644
index 00000000..4abc0226
--- /dev/null
+++ b/src/SPHCUDANeighborList.jl
@@ -0,0 +1,470 @@
+module SPHCUDANeighborList
+
+export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
+       ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
+       UpdateNeighborsCUDA!, NeighborLoopCUDA!,
+       NeighborLoopPerParticleCUDA!, CUDAPackedNeighborList,
+       AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA!
+
+const CUDAIndex = Int32
+
+using CUDA
+using StaticArrays
+
+"""
+    CUDACellGrid(Origin, CellSize, Dims)
+
+Describe a uniform cell grid for CUDA neighbor list construction.
+"""
+struct CUDACellGrid{D, T}
+    Origin::SVector{D, T}
+    InvCellSize::T
+    Dims::SVector{D, Int}
+    Strides::SVector{D, Int}
+    CellCount::Int
+end
+
+"""
+    CUDACellGrid(Origin, CellSize, Dims)
+
+Build a CUDA cell grid from an origin, cell size, and grid dimensions.
+"""
+function CUDACellGrid(Origin::SVector{D, T}, CellSize::T, Dims::SVector{D, Int}) where {D, T}
+    Strides = SVector{D, Int}(ntuple(i -> i == 1 ? 1 : prod(Dims[1:(i - 1)]), D))
+    CellCount = prod(Dims)
+    return CUDACellGrid{D, T}(Origin, inv(CellSize), Dims, Strides, CellCount)
+end
+
+"""
+    CUDANeighborList(Grid, ...)
+
+CUDA storage for per-cell particle ranges and neighbor cell indices.
+"""
+struct CUDANeighborList{D, T}
+    Grid::CUDACellGrid{D, T}
+    CellIds::CuArray{CUDAIndex}
+    ParticleOrder::CuArray{CUDAIndex}
+    ParticleRanges::CuArray{CUDAIndex}
+    CellCounts::CuArray{CUDAIndex}
+    CellOffsets::CuArray{CUDAIndex}
+    NeighborCells::CuArray{CUDAIndex}
+    NeighborCounts::CuArray{CUDAIndex}
+    NeighborOffsets::CuArray{SVector{D, CUDAIndex}}
+end
+
+struct CUDAPackedNeighborList
+    CellIds::CuArray{CUDAIndex}
+    ParticleOrder::CuArray{CUDAIndex}
+    ParticleRanges::CuArray{CUDAIndex}
+    NeighborCells::CuArray{CUDAIndex}
+    NeighborCounts::CuArray{CUDAIndex}
+    CellCount::Int
+    MaxNeighbors::Int
+end
+
+"""
+    ConstructNeighborOffsets(Val(D))
+
+Return the non-zero stencil offsets for a D-dimensional Moore neighborhood.
+"""
+function ConstructNeighborOffsets(::Val{D}) where {D}
+    Offsets = SVector{D, CUDAIndex}[]
+    for Offset in CartesianIndices(ntuple(_ -> -1:1, D))
+        if !all(iszero, Tuple(Offset))
+            push!(Offsets, SVector{D, CUDAIndex}(Tuple(Offset)))
+        end
+    end
+    return Offsets
+end
+
+"""
+    AllocateCUDANeighborList(Grid, ParticleCount)
+
+Allocate GPU buffers for neighbor cell lists and particle ordering.
+"""
+function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) where {D, T}
+    CellIds = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleRanges = CUDA.zeros(CUDAIndex, Grid.CellCount + 1)
+    CellCounts = CUDA.zeros(CUDAIndex, Grid.CellCount)
+    CellOffsets = CUDA.zeros(CUDAIndex, Grid.CellCount)
+    NeighborOffsets = CuArray(ConstructNeighborOffsets(Val(D)))
+    NeighborCells = CUDA.zeros(CUDAIndex, length(NeighborOffsets), Grid.CellCount)
+    NeighborCounts = CUDA.zeros(CUDAIndex, Grid.CellCount)
+    NeighborList = CUDANeighborList{D, T}(
+        Grid,
+        CellIds,
+        ParticleOrder,
+        ParticleRanges,
+        CellCounts,
+        CellOffsets,
+        NeighborCells,
+        NeighborCounts,
+        NeighborOffsets,
+    )
+    BuildNeighborCellListsCUDA!(NeighborList)
+    return NeighborList
+end
+
+function AllocateCUDAPackedNeighborList(ParticleCount::Int, CellCount::Int, MaxNeighbors::Int)
+    CellIds = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount)
+    ParticleRanges = CUDA.zeros(CUDAIndex, CellCount + 1)
+    NeighborCells = CUDA.zeros(CUDAIndex, MaxNeighbors, CellCount)
+    NeighborCounts = CUDA.zeros(CUDAIndex, CellCount)
+    return CUDAPackedNeighborList(
+        CellIds,
+        ParticleOrder,
+        ParticleRanges,
+        NeighborCells,
+        NeighborCounts,
+        CellCount,
+        MaxNeighbors,
+    )
+end
+
+function UpdateNeighborsCPUToCUDA!(NeighborList::Union{Nothing, CUDAPackedNeighborList},
+                                   CellIdsHost,
+                                   ParticleOrderHost,
+                                   ParticleRangesHost,
+                                   NeighborCellListsHost)
+    CellCount = length(NeighborCellListsHost)
+    MaxNeighbors = maximum(length, NeighborCellListsHost; init=0)
+    if NeighborList === nothing || NeighborList.CellCount != CellCount || NeighborList.MaxNeighbors < MaxNeighbors
+        NeighborList = AllocateCUDAPackedNeighborList(
+            length(CellIdsHost),
+            CellCount,
+            max(MaxNeighbors, 1),
+        )
+    end
+
+    NeighborCountsHost = zeros(CUDAIndex, CellCount)
+    NeighborCellsHost = zeros(CUDAIndex, NeighborList.MaxNeighbors, CellCount)
+    @inbounds for CellIndex in 1:CellCount
+        Neighbors = NeighborCellListsHost[CellIndex]
+        NeighborCountsHost[CellIndex] = CUDAIndex(length(Neighbors))
+        for (NeighborOffset, NeighborIndex) in enumerate(Neighbors)
+            NeighborCellsHost[NeighborOffset, CellIndex] = CUDAIndex(NeighborIndex)
+        end
+    end
+
+    copyto!(NeighborList.CellIds, CUDAIndex.(CellIdsHost))
+    copyto!(NeighborList.ParticleOrder, CUDAIndex.(ParticleOrderHost))
+    copyto!(NeighborList.ParticleRanges, CUDAIndex.(ParticleRangesHost))
+    copyto!(NeighborList.NeighborCells, NeighborCellsHost)
+    copyto!(NeighborList.NeighborCounts, NeighborCountsHost)
+    return NeighborList
+end
+
+@inline function LinearCellId(CellCoords::SVector{D, CUDAIndex}, Grid::CUDACellGrid{D, T}) where {D, T}
+    CellId = CUDAIndex(1)
+    @inbounds for DimIndex in 1:D
+        CellId += CellCoords[DimIndex] * CUDAIndex(Grid.Strides[DimIndex])
+    end
+    return CellId
+end
+
+@generated function CellCoordsFromId(CellId::CUDAIndex, Grid::CUDACellGrid{D, T}) where {D, T}
+    coord_symbols = [Symbol(:Coord_, i) for i in 1:D]
+    coord_defs = map(enumerate(coord_symbols)) do (i, sym)
+        quote
+            Dim = CUDAIndex(Grid.Dims[$i])
+            $sym = Remaining % Dim
+            Remaining = Remaining ÷ Dim
+        end
+    end
+    return quote
+        Remaining = CellId - CUDAIndex(1)
+        $(coord_defs...)
+        return SVector{$D, CUDAIndex}($(coord_symbols...))
+    end
+end
+
+@generated function CellCoordsFromPosition(Position, Grid::CUDACellGrid{D, T}) where {D, T}
+    coord_exprs = map(1:D) do i
+        quote
+            Raw = (Position[$i] - Grid.Origin[$i]) * Grid.InvCellSize
+            Cell = floor(CUDAIndex, Raw)
+            clamp(Cell, CUDAIndex(0), CUDAIndex(Grid.Dims[$i] - 1))
+        end
+    end
+    return quote
+        return SVector{$D, CUDAIndex}($(coord_exprs...))
+    end
+end
+
+function ComputeCellIdsKernel!(CellIds, Position, Grid::CUDACellGrid{D, T}) where {D, T}
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        Pos = Position[Index]
+        Coords = CellCoordsFromPosition(Pos, Grid)
+        CellIds[Index] = LinearCellId(Coords, Grid)
+    end
+    return nothing
+end
+
+function ComputeCellCountsKernel!(CellCounts, CellIds)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = CellIds[Index]
+        CUDA.atomic_add!(CellCounts, Int(CellId), CUDAIndex(1))
+    end
+    return nothing
+end
+
+function BuildParticleRangesKernel!(ParticleRanges, CellPrefix)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellPrefix)
+        if Index == 1
+            ParticleRanges[1] = CUDAIndex(1)
+        end
+        ParticleRanges[Index + 1] = CellPrefix[Index] + CUDAIndex(1)
+    end
+    return nothing
+end
+
+function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = CellIds[Index]
+        TargetIndex = CUDA.atomic_add!(CellOffsets, Int(CellId), CUDAIndex(1))
+        ParticleOrder[Int(TargetIndex)] = CUDAIndex(Index)
+    end
+    return nothing
+end
+
+function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid::CUDACellGrid{D, T}) where {D, T}
+    CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if CellId <= Grid.CellCount
+        CellCoord = CellCoordsFromId(CUDAIndex(CellId), Grid)
+        NeighborIndex = CUDAIndex(0)
+        for OffsetIndex in 1:length(NeighborOffsets)
+            Offset = NeighborOffsets[OffsetIndex]
+            NeighborCoord = CellCoord + Offset
+            IsValid = true
+            @inbounds for DimIndex in 1:D
+                if NeighborCoord[DimIndex] < CUDAIndex(0) || NeighborCoord[DimIndex] >= CUDAIndex(Grid.Dims[DimIndex])
+                    IsValid = false
+                    break
+                end
+            end
+            if IsValid
+                NeighborIndex += CUDAIndex(1)
+                NeighborCells[Int(NeighborIndex), CellId] = LinearCellId(NeighborCoord, Grid)
+            end
+        end
+        NeighborCounts[CellId] = NeighborIndex
+    end
+    return nothing
+end
+
+"""
+    BuildNeighborCellListsCUDA!(NeighborList)
+
+Precompute the neighbor cell indices for each grid cell on the GPU.
+"""
+function BuildNeighborCellListsCUDA!(NeighborList::CUDANeighborList)
+    Threads = 256
+    Blocks = cld(NeighborList.Grid.CellCount, Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks BuildNeighborCellListsKernel!(
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        NeighborList.NeighborOffsets,
+        NeighborList.Grid,
+    )
+    return nothing
+end
+
+"""
+    UpdateNeighborsCUDA!(NeighborList, Position)
+
+Update cell ids, particle ranges, and particle order on the GPU.
+"""
+function UpdateNeighborsCUDA!(NeighborList::CUDANeighborList, Position)
+    Threads = 256
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks ComputeCellIdsKernel!(
+        NeighborList.CellIds,
+        Position,
+        NeighborList.Grid,
+    )
+    fill!(NeighborList.CellCounts, 0)
+    CUDA.@cuda threads=Threads blocks=Blocks ComputeCellCountsKernel!(
+        NeighborList.CellCounts,
+        NeighborList.CellIds,
+    )
+    CellPrefix = CUDA.cumsum(NeighborList.CellCounts)
+    BlocksCells = cld(length(CellPrefix), Threads)
+    CUDA.@cuda threads=Threads blocks=BlocksCells BuildParticleRangesKernel!(
+        NeighborList.ParticleRanges,
+        CellPrefix,
+    )
+    copyto!(NeighborList.CellOffsets, NeighborList.ParticleRanges[1:(end - 1)])
+    CUDA.@cuda threads=Threads blocks=Blocks BuildParticleOrderKernel!(
+        NeighborList.ParticleOrder,
+        NeighborList.CellOffsets,
+        NeighborList.CellIds,
+    )
+    return nothing
+end
+
+function NeighborLoopKernel!(InteractionKernel,
+                             CellIds,
+                             ParticleOrder,
+                             ParticleRanges,
+                             NeighborCells,
+                             NeighborCounts,
+                             Args...)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = Int(CellIds[Index])
+        StartIndex = Int(ParticleRanges[CellId])
+        EndIndex = Int(ParticleRanges[CellId + 1]) - 1
+        for j in StartIndex:EndIndex
+            NeighborIndex = Int(ParticleOrder[j])
+            if NeighborIndex != Index
+                InteractionKernel(Index, NeighborIndex, Args...)
+            end
+        end
+        NeighborCount = Int(NeighborCounts[CellId])
+        for n in 1:NeighborCount
+            NeighborCell = Int(NeighborCells[n, CellId])
+            StartIndex = Int(ParticleRanges[NeighborCell])
+            EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1
+            for j in StartIndex:EndIndex
+                NeighborIndex = Int(ParticleOrder[j])
+                InteractionKernel(Index, NeighborIndex, Args...)
+            end
+        end
+    end
+    return nothing
+end
+
+function NeighborLoopPerParticleKernel!(InitKernel,
+                                        InteractionKernel,
+                                        FinalKernel,
+                                        CellIds,
+                                        ParticleOrder,
+                                        ParticleRanges,
+                                        NeighborCells,
+                                        NeighborCounts,
+                                        Args...)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(CellIds)
+        CellId = Int(CellIds[Index])
+        Accumulator = InitKernel(Index, Args...)
+        StartIndex = Int(ParticleRanges[CellId])
+        EndIndex = Int(ParticleRanges[CellId + 1]) - 1
+        for j in StartIndex:EndIndex
+            NeighborIndex = Int(ParticleOrder[j])
+            if NeighborIndex != Index
+                Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...)
+            end
+        end
+        NeighborCount = Int(NeighborCounts[CellId])
+        for n in 1:NeighborCount
+            NeighborCell = Int(NeighborCells[n, CellId])
+            StartIndex = Int(ParticleRanges[NeighborCell])
+            EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1
+            for j in StartIndex:EndIndex
+                NeighborIndex = Int(ParticleOrder[j])
+                Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...)
+            end
+        end
+        FinalKernel(Index, Accumulator, Args...)
+    end
+    return nothing
+end
+
+"""
+    NeighborLoopCUDA!(InteractionKernel, NeighborList, Args...; Threads=256)
+
+Launch a CUDA neighbor loop that calls `InteractionKernel(i, j, Args...)`.
+"""
+function NeighborLoopCUDA!(InteractionKernel,
+                           NeighborList::CUDANeighborList,
+                           Args...;
+                           Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!(
+        InteractionKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
+function NeighborLoopCUDA!(InteractionKernel,
+                           NeighborList::CUDAPackedNeighborList,
+                           Args...;
+                           Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!(
+        InteractionKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
+"""
+    NeighborLoopPerParticleCUDA!(InitKernel, InteractionKernel, FinalKernel, NeighborList, Args...; Threads=256)
+
+Launch a CUDA neighbor loop that accumulates per-particle state. The kernels
+are called as:
+
+- `InitKernel(i, Args...)` to initialize the per-particle accumulator.
+- `InteractionKernel(i, j, Accumulator, Args...)` for each neighbor, returning an updated accumulator.
+- `FinalKernel(i, Accumulator, Args...)` to store the final per-particle state.
+"""
+function NeighborLoopPerParticleCUDA!(InitKernel,
+                                      InteractionKernel,
+                                      FinalKernel,
+                                      NeighborList::CUDANeighborList,
+                                      Args...;
+                                      Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!(
+        InitKernel,
+        InteractionKernel,
+        FinalKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
+function NeighborLoopPerParticleCUDA!(InitKernel,
+                                      InteractionKernel,
+                                      FinalKernel,
+                                      NeighborList::CUDAPackedNeighborList,
+                                      Args...;
+                                      Threads::Int = 256)
+    Blocks = cld(length(NeighborList.CellIds), Threads)
+    CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!(
+        InitKernel,
+        InteractionKernel,
+        FinalKernel,
+        NeighborList.CellIds,
+        NeighborList.ParticleOrder,
+        NeighborList.ParticleRanges,
+        NeighborList.NeighborCells,
+        NeighborList.NeighborCounts,
+        Args...,
+    )
+    return nothing
+end
+
+end
diff --git a/src/SPHCUDASimulation.jl b/src/SPHCUDASimulation.jl
new file mode 100644
index 00000000..1736e373
--- /dev/null
+++ b/src/SPHCUDASimulation.jl
@@ -0,0 +1,377 @@
+module SPHCUDASimulation
+
+export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA!
+
+using CUDA
+using StaticArrays
+using Parameters
+using LinearAlgebra
+
+using ..SimulationConstantsConfiguration
+using ..SimulationGeometry
+using ..SimulationLoggerConfiguration
+using ..SimulationMetaDataConfiguration
+using ..ProduceHDFVTK
+using ..SPHCUDANeighborList
+using ..SPHDensityDiffusionModels
+using ..SPHKernels
+using ..SPHViscosityModels
+using ..TimeStepping: next_output_time
+using ..SimulationEquations: EquationOfStateGamma7
+using ..OpenExternalPrograms
+using ..SPHNeighborList: BuildNeighborCellLists!, ConstructStencil, UpdateNeighbors!
+
+struct CUDASimParticles{D, T}
+    Position::CuArray{SVector{D, T}}
+    Velocity::CuArray{SVector{D, T}}
+    Acceleration::CuArray{SVector{D, T}}
+    Density::CuArray{T}
+    Pressure::CuArray{T}
+    MotionLimiter::CuArray{T}
+    GravityFactor::CuArray{T}
+end
+
+@inline function AllocateCUDASimParticles(SimParticles)
+    return CUDASimParticles(
+        CuArray(SimParticles.Position),
+        CuArray(SimParticles.Velocity),
+        CuArray(SimParticles.Acceleration),
+        CuArray(SimParticles.Density),
+        CuArray(SimParticles.Pressure),
+        CuArray(SimParticles.MotionLimiter),
+        CuArray(SimParticles.GravityFactor),
+    )
+end
+
+@inline function SyncParticlesFromCUDA!(SimParticles, CUDAParticles::CUDASimParticles)
+    copyto!(SimParticles.Position, Array(CUDAParticles.Position))
+    copyto!(SimParticles.Velocity, Array(CUDAParticles.Velocity))
+    copyto!(SimParticles.Acceleration, Array(CUDAParticles.Acceleration))
+    copyto!(SimParticles.Density, Array(CUDAParticles.Density))
+    copyto!(SimParticles.Pressure, Array(CUDAParticles.Pressure))
+    return nothing
+end
+
+@inline function ComputeBounds(Position)
+    MinCorner = reduce((A, B) -> min.(A, B), Position)
+    MaxCorner = reduce((A, B) -> max.(A, B), Position)
+    return MinCorner, MaxCorner
+end
+
+@inline function BuildGrid(MinCorner, MaxCorner, CellSize)
+    Extents = MaxCorner - MinCorner
+    Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1)
+    return CUDACellGrid(MinCorner, CellSize, Dims)
+end
+
+@inline function BuildCellIds!(CellIds, Cells, CellDict)
+    @inbounds for Index in eachindex(CellIds, Cells)
+        CellIds[Index] = CellDict[Cells[Index]]
+    end
+    return nothing
+end
+
+@generated function GravityVector(::Val{D}, Value::T) where {D, T}
+    Entries = [:(Index == $D ? Value : zero(T)) for Index in 1:D]
+    return quote
+        return SVector{$D, T}($(Entries...))
+    end
+end
+
+function PressureKernel!(Pressure, Density, SimConstants)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Pressure)
+        ρ = Density[Index]
+        Pressure[Index] = EquationOfStateGamma7(ρ, SimConstants.c₀, SimConstants.ρ₀)
+    end
+    return nothing
+end
+
+function LimitDensityKernel!(Density, ρ₀, MotionLimiter)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Density)
+        if Density[Index] < ρ₀ && MotionLimiter[Index] == zero(eltype(MotionLimiter))
+            Density[Index] = ρ₀
+        end
+    end
+    return nothing
+end
+
+function DensityEpsiKernel!(Density, DensityRate, DensityHalf, Δt)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Density)
+        Epsi = -(DensityRate[Index] / DensityHalf[Index]) * Δt
+        Density[Index] *= (2 - Epsi) / (2 + Epsi)
+    end
+    return nothing
+end
+
+function HalfTimeStepKernel!(Position, Velocity, Acceleration, Density,
+                             GravityFactor, MotionLimiter,
+                             PositionHalf, VelocityHalf, DensityHalf, DensityRate,
+                             DtHalf, GravityValue, Dimensions)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Position)
+        Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index])
+        Acceleration[Index] = Acc
+        Factor = MotionLimiter[Index]
+        PositionHalf[Index] = Position[Index] + Velocity[Index] * DtHalf * Factor
+        VelocityHalf[Index] = Velocity[Index] + Acc * DtHalf * Factor
+        DensityHalf[Index] = Density[Index] + DensityRate[Index] * DtHalf
+    end
+    return nothing
+end
+
+function FullTimeStepKernel!(Position, Velocity, Acceleration, GravityFactor, MotionLimiter, Dt, GravityValue, Dimensions)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Position)
+        Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index])
+        Acceleration[Index] = Acc
+        Factor = MotionLimiter[Index]
+        Velocity[Index] += Acc * Dt * Factor
+        Position[Index] += (((Velocity[Index] + (Velocity[Index] - Acc * Dt * Factor)) / 2) * Dt) * Factor
+    end
+    return nothing
+end
+
+function TimeStepBuffersKernel!(MaxVisc, MinDtForce, Position, Velocity, Acceleration, SimKernel)
+    Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if Index <= length(Position)
+        R = Position[Index]
+        V = Velocity[Index]
+        A = Acceleration[Index]
+        RSq = sqrt(dot(R, R))^2
+        CurrVisc = abs(SimKernel.h * dot(V, R) / (RSq + SimKernel.η²))
+        AMag = norm(A)
+        CurrDtForce = AMag > 0 ? sqrt(SimKernel.h / AMag) : typemax(eltype(MinDtForce))
+        MaxVisc[Index] = CurrVisc
+        MinDtForce[Index] = CurrDtForce
+    end
+    return nothing
+end
+
+@inline function InitAccumulatorKernel(Index, Density, Position)
+    return (zero(eltype(Density)), zero(eltype(Position)))
+end
+
+@inline function InteractionKernel(Index, NeighborIndex, Accumulator,
+                                   Position, Density, Pressure, Velocity,
+                                   MotionLimiter, SimKernel, SimConstants)
+    DensityRate, AccelerationValue = Accumulator
+    Δx = Position[Index] - Position[NeighborIndex]
+    Δx² = dot(Δx, Δx)
+    if Δx² <= SimKernel.H²
+        Distance = sqrt(abs(Δx²))
+        Q = clamp(Distance * SimKernel.h⁻¹, zero(Distance), 2)
+        ∇W = ∇Wᵢⱼ(SimKernel, Q, Δx)
+
+        ρᵢ = Density[Index]
+        ρⱼ = Density[NeighborIndex]
+
+        Vᵢ = Velocity[Index]
+        Vⱼ = Velocity[NeighborIndex]
+        Vᵢⱼ = Vᵢ - Vⱼ
+        DensityTerm = dot(-Vᵢⱼ, ∇W)
+        DensityRateContribution = -ρᵢ * (SimConstants.m₀ / ρⱼ) * DensityTerm
+
+        LinearρFactor = inv(SimConstants.Cb * SimConstants.γ) * SimConstants.ρ₀
+        Pᵢⱼᴴ = SimConstants.ρ₀ * (-SimConstants.g) * -Δx[end]
+        ρᵢⱼᴴ = Pᵢⱼᴴ * LinearρFactor
+        InvD = one(eltype(ρᵢ)) / (Δx² + SimKernel.η²)
+        ρⱼᵢ = ρⱼ - ρᵢ
+        ψᵢⱼ = 2 * (ρⱼᵢ - ρᵢⱼᴴ) * (-Δx) * InvD
+        MotionFactor = MotionLimiter[Index] * MotionLimiter[NeighborIndex]
+        DiffusionContribution = SimConstants.δᵩ * SimKernel.h * SimConstants.c₀ * (SimConstants.m₀ / ρⱼ) * dot(ψᵢⱼ, ∇W) * MotionFactor
+
+        DensityRate += DensityRateContribution + DiffusionContribution
+
+        Pᵢ = Pressure[Index]
+        Pⱼ = Pressure[NeighborIndex]
+        Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ)
+        Fab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, Q, SimConstants.dx)
+        AccelerationContribution = -SimConstants.m₀ * (Pfac + Fab) * ∇W
+
+        ViscTerm = zero(AccelerationContribution)
+        VDotX = dot(Vᵢⱼ, Δx)
+        if VDotX < 0
+            ρ̄ = 0.5 * (ρᵢ + ρⱼ)
+            μᵢⱼ = SimKernel.h * VDotX / (Δx² + SimKernel.η²)
+            ViscTerm = -SimConstants.m₀ * (-SimConstants.α * SimConstants.c₀ * μᵢⱼ) / ρ̄ * ∇W
+        end
+
+        AccelerationValue += AccelerationContribution + ViscTerm
+    end
+
+    return DensityRate, AccelerationValue
+end
+
+@inline function FinalKernel(Index, Accumulator, DensityRate, Acceleration)
+    DensityRateValue, AccelerationValue = Accumulator
+    DensityRate[Index] = DensityRateValue
+    Acceleration[Index] = AccelerationValue
+    return nothing
+end
+
+function ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
+    MaxViscHost = maximum(Array(MaxVisc))
+    MinDtHost = minimum(Array(MinDtForce))
+    DtLimit = SimKernel.h / (SimConstants.c₀ + MaxViscHost)
+    return SimConstants.CFL * min(MinDtHost, DtLimit)
+end
+
+function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}},
+        SimMetaData::SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, LMode},
+        SimConstants::SimulationConstants,
+        SimKernel::SPHKernelInstance,
+        SimLogger::SimulationLogger,
+        SimParticles,
+        SimViscosity::ArtificialViscosity,
+        SimDensityDiffusion::LinearDensityDiffusion,
+        ) where {Dimensions, FloatType, LMode<:LogMode}
+
+    NumberOfPoints = length(SimParticles)
+    CUDAParticles = AllocateCUDASimParticles(SimParticles)
+
+    DensityRate = CUDA.zeros(FloatType, NumberOfPoints)
+    VelocityHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints)
+    PositionHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints)
+    DensityHalf = CUDA.zeros(FloatType, NumberOfPoints)
+    MaxVisc = CUDA.zeros(FloatType, NumberOfPoints)
+    MinDtForce = CUDA.zeros(FloatType, NumberOfPoints)
+
+    ParticleRanges = zeros(Int, NumberOfPoints + 1 + 1)
+    UniqueCells = zeros(CartesianIndex{Dimensions}, NumberOfPoints)
+    CellDict = Dict{CartesianIndex{Dimensions}, Int}()
+    FullStencil = ConstructStencil(Val(Dimensions))
+    NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)]
+    ParticleOrder = zeros(Int, NumberOfPoints)
+    CellOffsets = zeros(Int, length(ParticleRanges))
+    CellIdsHost = zeros(Int, NumberOfPoints)
+    PackedNeighborList = nothing
+
+    output = SetupVTKOutput(SimMetaData, SimParticles, SimKernel, Dimensions)
+
+    SimMetaData.OutputIterationCounter = 1
+    output.enqueue_particles(SimMetaData.OutputIterationCounter)
+
+    InitializeLog!(SimMetaData, SimLogger, SimConstants, SimKernel, SimViscosity, SimDensityDiffusion, SimGeometry, SimParticles)
+
+    Threads = 256
+    Blocks = cld(NumberOfPoints, Threads)
+    DimensionsVal = Val(Dimensions)
+
+    CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants)
+    CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!(
+        MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel,
+    )
+    Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
+
+    while SimMetaData.TotalTime <= next_output_time(SimMetaData)
+        copyto!(SimParticles.Position, Array(CUDAParticles.Position))
+        IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets)
+        UniqueCellsView = view(UniqueCells, 1:IndexCounter)
+        BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict)
+        BuildCellIds!(CellIdsHost, SimParticles.Cells, CellDict)
+        PackedNeighborList = UpdateNeighborsCPUToCUDA!(
+            PackedNeighborList,
+            CellIdsHost,
+            ParticleOrder,
+            view(ParticleRanges, 1:(IndexCounter + 1)),
+            view(NeighborCellLists, 1:IndexCounter),
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants)
+
+        NeighborLoopPerParticleCUDA!(
+            InitAccumulatorKernel,
+            InteractionKernel,
+            FinalKernel,
+            PackedNeighborList,
+            CUDAParticles.Position,
+            CUDAParticles.Density,
+            CUDAParticles.Pressure,
+            CUDAParticles.Velocity,
+            CUDAParticles.MotionLimiter,
+            SimKernel,
+            SimConstants,
+            DensityRate,
+            CUDAParticles.Acceleration,
+        )
+
+        DtHalf = Dt * 0.5
+        CUDA.@cuda threads=Threads blocks=Blocks HalfTimeStepKernel!(
+            CUDAParticles.Position,
+            CUDAParticles.Velocity,
+            CUDAParticles.Acceleration,
+            CUDAParticles.Density,
+            CUDAParticles.GravityFactor,
+            CUDAParticles.MotionLimiter,
+            PositionHalf,
+            VelocityHalf,
+            DensityHalf,
+            DensityRate,
+            DtHalf,
+            SimConstants.g,
+            DimensionsVal,
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(DensityHalf, SimConstants.ρ₀, CUDAParticles.MotionLimiter)
+
+        CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, DensityHalf, SimConstants)
+
+        NeighborLoopPerParticleCUDA!(
+            InitAccumulatorKernel,
+            InteractionKernel,
+            FinalKernel,
+            PackedNeighborList,
+            PositionHalf,
+            DensityHalf,
+            CUDAParticles.Pressure,
+            VelocityHalf,
+            CUDAParticles.MotionLimiter,
+            SimKernel,
+            SimConstants,
+            DensityRate,
+            CUDAParticles.Acceleration,
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter)
+        CUDA.@cuda threads=Threads blocks=Blocks DensityEpsiKernel!(CUDAParticles.Density, DensityRate, DensityHalf, Dt)
+
+        CUDA.@cuda threads=Threads blocks=Blocks FullTimeStepKernel!(
+            CUDAParticles.Position,
+            CUDAParticles.Velocity,
+            CUDAParticles.Acceleration,
+            CUDAParticles.GravityFactor,
+            CUDAParticles.MotionLimiter,
+            Dt,
+            SimConstants.g,
+            DimensionsVal,
+        )
+
+        CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!(
+            MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel,
+        )
+        Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel)
+
+        SimMetaData.Iteration += 1
+        SimMetaData.CurrentTimeStep = Dt
+        SimMetaData.TotalTime += Dt
+
+        SyncParticlesFromCUDA!(SimParticles, CUDAParticles)
+        LogStep!(SimMetaData, SimLogger)
+        SimMetaData.OutputIterationCounter += 1
+        output.enqueue_particles(SimMetaData.OutputIterationCounter)
+
+        if SimMetaData.TotalTime > SimMetaData.SimulationTime
+            output.close_files()
+            FinalizeLog!(SimMetaData, SimLogger)
+            AutoOpenLogFile(SimLogger, SimMetaData)
+            break
+        end
+    end
+
+    return nothing
+end
+
+end
diff --git a/src/SPHExample.jl b/src/SPHExample.jl
index a56895ea..ae313057 100644
--- a/src/SPHExample.jl
+++ b/src/SPHExample.jl
@@ -14,6 +14,11 @@ module SPHExample
     include("OpenExternalPrograms.jl")
     include("SPHDensityDiffusionModels.jl")  
     include("SPHNeighborList.jl")
+    const HasCUDA = Base.find_package("CUDA") !== nothing
+    if HasCUDA
+        include("SPHCUDANeighborList.jl")
+        include("SPHCUDASimulation.jl")
+    end
     include("SPHCellList.jl") #Must be last    
 
     # Re-export desired functions from each submodule
@@ -60,6 +65,16 @@ module SPHExample
     export ConstructStencil, ExtractCells!, UpdateNeighbors!,
            BuildNeighborCellLists!, ComputeCellParticleCounts, ComputeCellNeighborCounts
 
+    if HasCUDA
+        using .SPHCUDANeighborList
+        export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList,
+               ConstructNeighborOffsets, BuildNeighborCellListsCUDA!,
+               UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!,
+               CUDAPackedNeighborList, AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA!
+        using .SPHCUDASimulation
+        export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA!
+    end
+
     using .SPHCellList
     export NeighborLoop!, ComputeInteractions!, RunSimulation