From 2882e547623e6de66d621ee0f6bae6fea13769fd Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Wed, 14 Jan 2026 23:57:49 +0100 Subject: [PATCH 1/8] Add CUDA neighbor list utilities --- README.md | 43 ++++++ src/SPHCUDANeighborList.jl | 287 +++++++++++++++++++++++++++++++++++++ src/SPHExample.jl | 11 ++ 3 files changed, 341 insertions(+) create mode 100644 src/SPHCUDANeighborList.jl diff --git a/README.md b/README.md index c23bc265..c3f854e6 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ The project demonstrates how to assemble a small SPH solver with Julia. It focus - **Dynamic boundary condition** – inspired by DualSPHysics. - **Density diffusion** – based on Fourtakas et al. 2019 to reduce pressure noise. - **Wendland quintic kernel** – simple and stable without tensile corrections. +- **CUDA neighbor lists** – optional GPU neighbor cell list and loop utilities for CUDA arrays. ## Folder Structure @@ -91,6 +92,48 @@ one plus the particles in its neighbor stencil. Neighbor rebuilds use a per-cell ordering buffer without reordering particle storage. +### CUDA Neighbor Utilities + +This package ships with an optional CUDA-focused neighbor list to offload the +cell list construction and neighbor iteration. Install CUDA.jl in the project +environment before using these helpers: + +```julia +using Pkg +Pkg.add("CUDA") +``` + +The CUDA utilities are only loaded when CUDA.jl is available. The workflow is: + +1. Build a `CUDACellGrid` describing the simulation bounds and desired cell size. +2. Allocate a `CUDANeighborList` once for the particle count. +3. Call `UpdateNeighborsCUDA!` each step after updating positions. +4. Launch `NeighborLoopCUDA!` with a GPU kernel that computes pairwise + interactions. + +Here is a minimal sketch: + +```julia +using CUDA +using SPHExample + +grid = CUDACellGrid(SVector(0.0f0, 0.0f0), 0.01f0, SVector(128, 128)) +neighbor_list = AllocateCUDANeighborList(grid, length(position)) + +UpdateNeighborsCUDA!(neighbor_list, position) + +function InteractionKernel(i, j, acc, pos) + @inbounds acc[i] += pos[j] - pos[i] + return nothing +end + +NeighborLoopCUDA!(InteractionKernel, neighbor_list, acc, position) +``` + +These CUDA utilities are intended as building blocks for integrating GPU +workflows into the existing solver. You can keep the CPU path intact and +incrementally replace the neighbor loop where it makes sense for your use case. + ## Help Questions or issues can be posted on the GitHub issue tracker. Response times may vary but all feedback is welcome. diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl new file mode 100644 index 00000000..a684ab55 --- /dev/null +++ b/src/SPHCUDANeighborList.jl @@ -0,0 +1,287 @@ +module SPHCUDANeighborList + +export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, + ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, + UpdateNeighborsCUDA!, NeighborLoopCUDA! + +using CUDA +using StaticArrays + +""" + CUDACellGrid(Origin, CellSize, Dims) + +Describe a uniform cell grid for CUDA neighbor list construction. +""" +struct CUDACellGrid{D, T} + Origin::SVector{D, T} + InvCellSize::T + Dims::SVector{D, Int} + Strides::SVector{D, Int} + CellCount::Int +end + +""" + CUDACellGrid(Origin, CellSize, Dims) + +Build a CUDA cell grid from an origin, cell size, and grid dimensions. +""" +function CUDACellGrid(Origin::SVector{D, T}, CellSize::T, Dims::SVector{D, Int}) where {D, T} + Strides = SVector{D, Int}(ntuple(i -> i == 1 ? 1 : prod(Dims[1:(i - 1)]), D)) + CellCount = prod(Dims) + return CUDACellGrid{D, T}(Origin, inv(CellSize), Dims, Strides, CellCount) +end + +""" + CUDANeighborList(Grid, ...) + +CUDA storage for per-cell particle ranges and neighbor cell indices. +""" +struct CUDANeighborList{D, T} + Grid::CUDACellGrid{D, T} + CellIds::CuArray{Int} + ParticleOrder::CuArray{Int} + ParticleRanges::CuArray{Int} + CellCounts::CuArray{Int} + CellOffsets::CuArray{Int} + NeighborCells::CuArray{Int} + NeighborCounts::CuArray{Int} + NeighborOffsets::CuArray{SVector{D, Int}} +end + +""" + ConstructNeighborOffsets(Val(D)) + +Return the non-zero stencil offsets for a D-dimensional Moore neighborhood. +""" +function ConstructNeighborOffsets(::Val{D}) where {D} + Offsets = SVector{D, Int}[] + for Offset in CartesianIndices(ntuple(_ -> -1:1, D)) + if !all(iszero, Tuple(Offset)) + push!(Offsets, SVector{D, Int}(Tuple(Offset))) + end + end + return Offsets +end + +""" + AllocateCUDANeighborList(Grid, ParticleCount) + +Allocate GPU buffers for neighbor cell lists and particle ordering. +""" +function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) where {D, T} + CellIds = CUDA.zeros(Int, ParticleCount) + ParticleOrder = CUDA.zeros(Int, ParticleCount) + ParticleRanges = CUDA.zeros(Int, Grid.CellCount + 1) + CellCounts = CUDA.zeros(Int, Grid.CellCount) + CellOffsets = CUDA.zeros(Int, Grid.CellCount) + NeighborOffsets = CuArray(ConstructNeighborOffsets(Val(D))) + NeighborCells = CUDA.zeros(Int, length(NeighborOffsets), Grid.CellCount) + NeighborCounts = CUDA.zeros(Int, Grid.CellCount) + NeighborList = CUDANeighborList{D, T}( + Grid, + CellIds, + ParticleOrder, + ParticleRanges, + CellCounts, + CellOffsets, + NeighborCells, + NeighborCounts, + NeighborOffsets, + ) + BuildNeighborCellListsCUDA!(NeighborList) + return NeighborList +end + +@inline function LinearCellId(CellCoords::SVector{D, Int}, Grid::CUDACellGrid{D, T}) where {D, T} + CellId = 1 + @inbounds for DimIndex in 1:D + CellId += CellCoords[DimIndex] * Grid.Strides[DimIndex] + end + return CellId +end + +@inline function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T} + Remaining = CellId - 1 + Coords = ntuple(DimIndex -> begin + Dim = Grid.Dims[DimIndex] + Coord = Remaining % Dim + Remaining ÷= Dim + Coord + end, D) + return SVector{D, Int}(Coords) +end + +function ComputeCellIdsKernel!(CellIds, Position, Grid::CUDACellGrid{D, T}) where {D, T} + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + Pos = Position[Index] + Coords = ntuple(DimIndex -> begin + Raw = (Pos[DimIndex] - Grid.Origin[DimIndex]) * Grid.InvCellSize + Cell = floor(Int, Raw) + clamp(Cell, 0, Grid.Dims[DimIndex] - 1) + end, D) + CellIds[Index] = LinearCellId(SVector{D, Int}(Coords), Grid) + end + return nothing +end + +function ComputeCellCountsKernel!(CellCounts, CellIds) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = CellIds[Index] + CUDA.atomic_add!(CellCounts, CellId, 1) + end + return nothing +end + +function BuildParticleRangesKernel!(ParticleRanges, CellPrefix) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellPrefix) + if Index == 1 + ParticleRanges[1] = 1 + end + ParticleRanges[Index + 1] = CellPrefix[Index] + 1 + end + return nothing +end + +function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = CellIds[Index] + TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, 1) + ParticleOrder[TargetIndex] = Index + end + return nothing +end + +function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid) + CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if CellId <= Grid.CellCount + CellCoord = CellCoordsFromId(CellId, Grid) + NeighborIndex = 0 + for OffsetIndex in 1:length(NeighborOffsets) + Offset = NeighborOffsets[OffsetIndex] + NeighborCoord = CellCoord + Offset + IsValid = true + @inbounds for DimIndex in 1:length(Grid.Dims) + if NeighborCoord[DimIndex] < 0 || NeighborCoord[DimIndex] >= Grid.Dims[DimIndex] + IsValid = false + break + end + end + if IsValid + NeighborIndex += 1 + NeighborCells[NeighborIndex, CellId] = LinearCellId(NeighborCoord, Grid) + end + end + NeighborCounts[CellId] = NeighborIndex + end + return nothing +end + +""" + BuildNeighborCellListsCUDA!(NeighborList) + +Precompute the neighbor cell indices for each grid cell on the GPU. +""" +function BuildNeighborCellListsCUDA!(NeighborList::CUDANeighborList) + Threads = 256 + Blocks = cld(NeighborList.Grid.CellCount, Threads) + CUDA.@cuda threads=Threads blocks=Blocks BuildNeighborCellListsKernel!( + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + NeighborList.NeighborOffsets, + NeighborList.Grid, + ) + return nothing +end + +""" + UpdateNeighborsCUDA!(NeighborList, Position) + +Update cell ids, particle ranges, and particle order on the GPU. +""" +function UpdateNeighborsCUDA!(NeighborList::CUDANeighborList, Position) + Threads = 256 + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks ComputeCellIdsKernel!( + NeighborList.CellIds, + Position, + NeighborList.Grid, + ) + fill!(NeighborList.CellCounts, 0) + CUDA.@cuda threads=Threads blocks=Blocks ComputeCellCountsKernel!( + NeighborList.CellCounts, + NeighborList.CellIds, + ) + CellPrefix = CUDA.cumsum(NeighborList.CellCounts) + BlocksCells = cld(length(CellPrefix), Threads) + CUDA.@cuda threads=Threads blocks=BlocksCells BuildParticleRangesKernel!( + NeighborList.ParticleRanges, + CellPrefix, + ) + copyto!(NeighborList.CellOffsets, NeighborList.ParticleRanges[1:(end - 1)]) + CUDA.@cuda threads=Threads blocks=Blocks BuildParticleOrderKernel!( + NeighborList.ParticleOrder, + NeighborList.CellOffsets, + NeighborList.CellIds, + ) + return nothing +end + +function NeighborLoopKernel!(InteractionKernel, + CellIds, + ParticleOrder, + ParticleRanges, + NeighborCells, + NeighborCounts, + Args...) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = CellIds[Index] + StartIndex = ParticleRanges[CellId] + EndIndex = ParticleRanges[CellId + 1] - 1 + for j in StartIndex:EndIndex + NeighborIndex = ParticleOrder[j] + if NeighborIndex != Index + InteractionKernel(Index, NeighborIndex, Args...) + end + end + NeighborCount = NeighborCounts[CellId] + for n in 1:NeighborCount + NeighborCell = NeighborCells[n, CellId] + StartIndex = ParticleRanges[NeighborCell] + EndIndex = ParticleRanges[NeighborCell + 1] - 1 + for j in StartIndex:EndIndex + NeighborIndex = ParticleOrder[j] + InteractionKernel(Index, NeighborIndex, Args...) + end + end + end + return nothing +end + +""" + NeighborLoopCUDA!(InteractionKernel, NeighborList, Args...; Threads=256) + +Launch a CUDA neighbor loop that calls `InteractionKernel(i, j, Args...)`. +""" +function NeighborLoopCUDA!(InteractionKernel, + NeighborList::CUDANeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!( + InteractionKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + +end diff --git a/src/SPHExample.jl b/src/SPHExample.jl index a56895ea..0aa158a9 100644 --- a/src/SPHExample.jl +++ b/src/SPHExample.jl @@ -14,6 +14,10 @@ module SPHExample include("OpenExternalPrograms.jl") include("SPHDensityDiffusionModels.jl") include("SPHNeighborList.jl") + const HasCUDA = Base.find_package("CUDA") !== nothing + if HasCUDA + include("SPHCUDANeighborList.jl") + end include("SPHCellList.jl") #Must be last # Re-export desired functions from each submodule @@ -60,6 +64,13 @@ module SPHExample export ConstructStencil, ExtractCells!, UpdateNeighbors!, BuildNeighborCellLists!, ComputeCellParticleCounts, ComputeCellNeighborCounts + if HasCUDA + using .SPHCUDANeighborList + export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, + ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, + UpdateNeighborsCUDA!, NeighborLoopCUDA! + end + using .SPHCellList export NeighborLoop!, ComputeInteractions!, RunSimulation From a8ba51d481d7dc35d3b13911f76f2bf0f27c5f09 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:03:24 +0100 Subject: [PATCH 2/8] Add CUDA StillWedge neighbor example --- README.md | 2 + example/StillWedgeMDBC_CUDA.jl | 85 ++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 example/StillWedgeMDBC_CUDA.jl diff --git a/README.md b/README.md index c3f854e6..a776b46d 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,8 @@ NeighborLoopCUDA!(InteractionKernel, neighbor_list, acc, position) These CUDA utilities are intended as building blocks for integrating GPU workflows into the existing solver. You can keep the CPU path intact and incrementally replace the neighbor loop where it makes sense for your use case. +The `example/StillWedgeMDBC_CUDA.jl` script shows how to load the StillWedge +geometry and run a CUDA neighbor-count pass using these utilities. ## Help diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl new file mode 100644 index 00000000..1340769b --- /dev/null +++ b/example/StillWedgeMDBC_CUDA.jl @@ -0,0 +1,85 @@ +using SPHExample +using CUDA +using StaticArrays + +if !SPHExample.HasCUDA + error("CUDA.jl is required to run this example.") +end + +@inline function ComputeBounds(Position) + MinCorner = reduce((A, B) -> min.(A, B), Position) + MaxCorner = reduce((A, B) -> max.(A, B), Position) + return MinCorner, MaxCorner +end + +@inline function BuildGrid(MinCorner, MaxCorner, CellSize) + Extents = MaxCorner - MinCorner + Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1) + return CUDACellGrid(MinCorner, CellSize, Dims) +end + +@inline function SquaredNorm(Vector) + Accumulator = zero(eltype(Vector)) + @inbounds for Index in eachindex(Vector) + Accumulator += Vector[Index] * Vector[Index] + end + return Accumulator +end + +@inline function NeighborCountInteractionKernel(Index, NeighborIndex, Counts, Position, SupportRadiusSquared) + Δx = Position[Index] - Position[NeighborIndex] + if SquaredNorm(Δx) <= SupportRadiusSquared + CUDA.atomic_add!(Counts, Index, 1) + end + return nothing +end + +function CountNeighborsCUDA(Position, NeighborList, SupportRadiusSquared) + Counts = CUDA.zeros(Int, length(Position)) + NeighborLoopCUDA!(NeighborCountInteractionKernel, NeighborList, Counts, Position, SupportRadiusSquared) + return Counts +end + +let + Dimensions = 2 + FloatType = Float32 + + SimConstantsWedge = SimulationConstants{FloatType}(dx=0.02f0, c₀=42.485762f0, δᵩ=0.1f0, CFL=0.5f0) + + FixedBoundary = Geometry{Dimensions, FloatType}( + CSVFile = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Bound.csv", + GroupMarker = 1, + Type = Fixed, + Motion = nothing, + ) + + Water = Geometry{Dimensions, FloatType}( + CSVFile = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Fluid.csv", + GroupMarker = 2, + Type = Fluid, + Motion = nothing, + ) + + SimulationGeometry = [FixedBoundary; Water] + SimParticles = AllocateDataStructures(SimulationGeometry) + + SimKernel = SPHKernelInstance{Dimensions, FloatType}(WendlandC2(); dx = SimConstantsWedge.dx) + + PositionGPU = CuArray(SimParticles.Position) + MinCorner, MaxCorner = ComputeBounds(SimParticles.Position) + Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H) + NeighborList = AllocateCUDANeighborList(Grid, length(PositionGPU)) + + UpdateNeighborsCUDA!(NeighborList, PositionGPU) + NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborList, SimKernel.H²) + + NeighborCountsHost = Array(NeighborCounts) + MinimumNeighbors = minimum(NeighborCountsHost) + MaximumNeighbors = maximum(NeighborCountsHost) + AverageNeighbors = sum(NeighborCountsHost) / length(NeighborCountsHost) + + println("StillWedge CUDA neighbor stats (within H):") + println(" Min neighbors: ", MinimumNeighbors) + println(" Max neighbors: ", MaximumNeighbors) + println(" Avg neighbors: ", AverageNeighbors) +end From 8e8811b5a1328a63ac7ffcde706c77d673de73bf Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:13:53 +0100 Subject: [PATCH 3/8] Fix CUDA neighbor kernel coords --- src/SPHCUDANeighborList.jl | 48 ++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl index a684ab55..83dbe6b6 100644 --- a/src/SPHCUDANeighborList.jl +++ b/src/SPHCUDANeighborList.jl @@ -100,27 +100,41 @@ end return CellId end -@inline function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T} - Remaining = CellId - 1 - Coords = ntuple(DimIndex -> begin - Dim = Grid.Dims[DimIndex] - Coord = Remaining % Dim - Remaining ÷= Dim - Coord - end, D) - return SVector{D, Int}(Coords) +@generated function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T} + coord_symbols = [Symbol(:Coord_, i) for i in 1:D] + coord_defs = map(enumerate(coord_symbols)) do (i, sym) + quote + Dim = Grid.Dims[$i] + $sym = Remaining % Dim + Remaining = Remaining ÷ Dim + end + end + return quote + Remaining = CellId - 1 + $(coord_defs...) + return SVector{$D, Int}($(coord_symbols...)) + end +end + +@generated function CellCoordsFromPosition(Position, Grid::CUDACellGrid{D, T}) where {D, T} + coord_exprs = map(1:D) do i + quote + Raw = (Position[$i] - Grid.Origin[$i]) * Grid.InvCellSize + Cell = floor(Int, Raw) + clamp(Cell, 0, Grid.Dims[$i] - 1) + end + end + return quote + return SVector{$D, Int}($(coord_exprs...)) + end end function ComputeCellIdsKernel!(CellIds, Position, Grid::CUDACellGrid{D, T}) where {D, T} Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) Pos = Position[Index] - Coords = ntuple(DimIndex -> begin - Raw = (Pos[DimIndex] - Grid.Origin[DimIndex]) * Grid.InvCellSize - Cell = floor(Int, Raw) - clamp(Cell, 0, Grid.Dims[DimIndex] - 1) - end, D) - CellIds[Index] = LinearCellId(SVector{D, Int}(Coords), Grid) + Coords = CellCoordsFromPosition(Pos, Grid) + CellIds[Index] = LinearCellId(Coords, Grid) end return nothing end @@ -155,7 +169,7 @@ function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds) return nothing end -function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid) +function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid::CUDACellGrid{D, T}) where {D, T} CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x if CellId <= Grid.CellCount CellCoord = CellCoordsFromId(CellId, Grid) @@ -164,7 +178,7 @@ function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOf Offset = NeighborOffsets[OffsetIndex] NeighborCoord = CellCoord + Offset IsValid = true - @inbounds for DimIndex in 1:length(Grid.Dims) + @inbounds for DimIndex in 1:D if NeighborCoord[DimIndex] < 0 || NeighborCoord[DimIndex] >= Grid.Dims[DimIndex] IsValid = false break From b56162c6504ae79d078bc097d29e99a1d31b1810 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:18:30 +0100 Subject: [PATCH 4/8] Add CUDA per-particle neighbor loop --- README.md | 2 ++ src/SPHCUDANeighborList.jl | 70 +++++++++++++++++++++++++++++++++++++- src/SPHExample.jl | 2 +- 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a776b46d..5c1632fe 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,8 @@ The CUDA utilities are only loaded when CUDA.jl is available. The workflow is: 3. Call `UpdateNeighborsCUDA!` each step after updating positions. 4. Launch `NeighborLoopCUDA!` with a GPU kernel that computes pairwise interactions. + For per-particle accumulation (mirroring `NeighborLoopPerParticle!`), + use `NeighborLoopPerParticleCUDA!` with init/interact/final callbacks. Here is a minimal sketch: diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl index 83dbe6b6..9575ce1d 100644 --- a/src/SPHCUDANeighborList.jl +++ b/src/SPHCUDANeighborList.jl @@ -2,7 +2,8 @@ module SPHCUDANeighborList export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, - UpdateNeighborsCUDA!, NeighborLoopCUDA! + UpdateNeighborsCUDA!, NeighborLoopCUDA!, + NeighborLoopPerParticleCUDA! using CUDA using StaticArrays @@ -276,6 +277,42 @@ function NeighborLoopKernel!(InteractionKernel, return nothing end +function NeighborLoopPerParticleKernel!(InitKernel, + InteractionKernel, + FinalKernel, + CellIds, + ParticleOrder, + ParticleRanges, + NeighborCells, + NeighborCounts, + Args...) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = CellIds[Index] + Accumulator = InitKernel(Index, Args...) + StartIndex = ParticleRanges[CellId] + EndIndex = ParticleRanges[CellId + 1] - 1 + for j in StartIndex:EndIndex + NeighborIndex = ParticleOrder[j] + if NeighborIndex != Index + Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...) + end + end + NeighborCount = NeighborCounts[CellId] + for n in 1:NeighborCount + NeighborCell = NeighborCells[n, CellId] + StartIndex = ParticleRanges[NeighborCell] + EndIndex = ParticleRanges[NeighborCell + 1] - 1 + for j in StartIndex:EndIndex + NeighborIndex = ParticleOrder[j] + Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...) + end + end + FinalKernel(Index, Accumulator, Args...) + end + return nothing +end + """ NeighborLoopCUDA!(InteractionKernel, NeighborList, Args...; Threads=256) @@ -298,4 +335,35 @@ function NeighborLoopCUDA!(InteractionKernel, return nothing end +""" + NeighborLoopPerParticleCUDA!(InitKernel, InteractionKernel, FinalKernel, NeighborList, Args...; Threads=256) + +Launch a CUDA neighbor loop that accumulates per-particle state. The kernels +are called as: + +- `InitKernel(i, Args...)` to initialize the per-particle accumulator. +- `InteractionKernel(i, j, Accumulator, Args...)` for each neighbor, returning an updated accumulator. +- `FinalKernel(i, Accumulator, Args...)` to store the final per-particle state. +""" +function NeighborLoopPerParticleCUDA!(InitKernel, + InteractionKernel, + FinalKernel, + NeighborList::CUDANeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!( + InitKernel, + InteractionKernel, + FinalKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + end diff --git a/src/SPHExample.jl b/src/SPHExample.jl index 0aa158a9..071b1c35 100644 --- a/src/SPHExample.jl +++ b/src/SPHExample.jl @@ -68,7 +68,7 @@ module SPHExample using .SPHCUDANeighborList export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, - UpdateNeighborsCUDA!, NeighborLoopCUDA! + UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA! end using .SPHCellList From 171ca937254da74ecd65402e76aa25bd636e5ffb Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:21:23 +0100 Subject: [PATCH 5/8] Run StillWedge CPU sim after CUDA setup --- example/StillWedgeMDBC_CUDA.jl | 31 ++++++++- src/SPHCUDANeighborList.jl | 112 +++++++++++++++++---------------- 2 files changed, 87 insertions(+), 56 deletions(-) diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl index 1340769b..8d472a85 100644 --- a/example/StillWedgeMDBC_CUDA.jl +++ b/example/StillWedgeMDBC_CUDA.jl @@ -60,10 +60,28 @@ let Motion = nothing, ) + SimMetaDataWedge = SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, StoreLog}( + SimulationName="StillWedge", + SaveLocation="W:/Simulations/StillWedge2D_MDBC_CUDA", + SimulationTime=4.0f0, + OutputTimes=0.01f0, + VisualizeInParaview=true, + ExportSingleVTKHDF=true, + ExportGridCells=true, + OpenLogFile=true, + ) + + if !isdir(SimMetaDataWedge.SaveLocation) + mkdir(SimMetaDataWedge.SaveLocation) + end + SimulationGeometry = [FixedBoundary; Water] - SimParticles = AllocateDataStructures(SimulationGeometry) + SimParticles = AllocateDataStructures(SimulationGeometry, SimMetaDataWedge) SimKernel = SPHKernelInstance{Dimensions, FloatType}(WendlandC2(); dx = SimConstantsWedge.dx) + SimLogger = SimulationLogger(SimMetaDataWedge.SaveLocation) + + CleanUpSimulationFolder(SimMetaDataWedge.SaveLocation) PositionGPU = CuArray(SimParticles.Position) MinCorner, MaxCorner = ComputeBounds(SimParticles.Position) @@ -82,4 +100,15 @@ let println(" Min neighbors: ", MinimumNeighbors) println(" Max neighbors: ", MaximumNeighbors) println(" Avg neighbors: ", AverageNeighbors) + + RunSimulation( + SimGeometry = SimulationGeometry, + SimMetaData = SimMetaDataWedge, + SimConstants = SimConstantsWedge, + SimKernel = SimKernel, + SimLogger = SimLogger, + SimParticles = SimParticles, + SimViscosity = ArtificialViscosity(), + SimDensityDiffusion = LinearDensityDiffusion(), + ) end diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl index 9575ce1d..4cf1acf7 100644 --- a/src/SPHCUDANeighborList.jl +++ b/src/SPHCUDANeighborList.jl @@ -5,6 +5,8 @@ export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA! +const CUDAIndex = Int32 + using CUDA using StaticArrays @@ -39,14 +41,14 @@ CUDA storage for per-cell particle ranges and neighbor cell indices. """ struct CUDANeighborList{D, T} Grid::CUDACellGrid{D, T} - CellIds::CuArray{Int} - ParticleOrder::CuArray{Int} - ParticleRanges::CuArray{Int} - CellCounts::CuArray{Int} - CellOffsets::CuArray{Int} - NeighborCells::CuArray{Int} - NeighborCounts::CuArray{Int} - NeighborOffsets::CuArray{SVector{D, Int}} + CellIds::CuArray{CUDAIndex} + ParticleOrder::CuArray{CUDAIndex} + ParticleRanges::CuArray{CUDAIndex} + CellCounts::CuArray{CUDAIndex} + CellOffsets::CuArray{CUDAIndex} + NeighborCells::CuArray{CUDAIndex} + NeighborCounts::CuArray{CUDAIndex} + NeighborOffsets::CuArray{SVector{D, CUDAIndex}} end """ @@ -55,10 +57,10 @@ end Return the non-zero stencil offsets for a D-dimensional Moore neighborhood. """ function ConstructNeighborOffsets(::Val{D}) where {D} - Offsets = SVector{D, Int}[] + Offsets = SVector{D, CUDAIndex}[] for Offset in CartesianIndices(ntuple(_ -> -1:1, D)) if !all(iszero, Tuple(Offset)) - push!(Offsets, SVector{D, Int}(Tuple(Offset))) + push!(Offsets, SVector{D, CUDAIndex}(Tuple(Offset))) end end return Offsets @@ -70,14 +72,14 @@ end Allocate GPU buffers for neighbor cell lists and particle ordering. """ function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) where {D, T} - CellIds = CUDA.zeros(Int, ParticleCount) - ParticleOrder = CUDA.zeros(Int, ParticleCount) - ParticleRanges = CUDA.zeros(Int, Grid.CellCount + 1) - CellCounts = CUDA.zeros(Int, Grid.CellCount) - CellOffsets = CUDA.zeros(Int, Grid.CellCount) + CellIds = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleRanges = CUDA.zeros(CUDAIndex, Grid.CellCount + 1) + CellCounts = CUDA.zeros(CUDAIndex, Grid.CellCount) + CellOffsets = CUDA.zeros(CUDAIndex, Grid.CellCount) NeighborOffsets = CuArray(ConstructNeighborOffsets(Val(D))) - NeighborCells = CUDA.zeros(Int, length(NeighborOffsets), Grid.CellCount) - NeighborCounts = CUDA.zeros(Int, Grid.CellCount) + NeighborCells = CUDA.zeros(CUDAIndex, length(NeighborOffsets), Grid.CellCount) + NeighborCounts = CUDA.zeros(CUDAIndex, Grid.CellCount) NeighborList = CUDANeighborList{D, T}( Grid, CellIds, @@ -93,27 +95,27 @@ function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) return NeighborList end -@inline function LinearCellId(CellCoords::SVector{D, Int}, Grid::CUDACellGrid{D, T}) where {D, T} - CellId = 1 +@inline function LinearCellId(CellCoords::SVector{D, CUDAIndex}, Grid::CUDACellGrid{D, T}) where {D, T} + CellId = CUDAIndex(1) @inbounds for DimIndex in 1:D - CellId += CellCoords[DimIndex] * Grid.Strides[DimIndex] + CellId += CellCoords[DimIndex] * CUDAIndex(Grid.Strides[DimIndex]) end return CellId end -@generated function CellCoordsFromId(CellId::Int, Grid::CUDACellGrid{D, T}) where {D, T} +@generated function CellCoordsFromId(CellId::CUDAIndex, Grid::CUDACellGrid{D, T}) where {D, T} coord_symbols = [Symbol(:Coord_, i) for i in 1:D] coord_defs = map(enumerate(coord_symbols)) do (i, sym) quote - Dim = Grid.Dims[$i] + Dim = CUDAIndex(Grid.Dims[$i]) $sym = Remaining % Dim Remaining = Remaining ÷ Dim end end return quote - Remaining = CellId - 1 + Remaining = CellId - CUDAIndex(1) $(coord_defs...) - return SVector{$D, Int}($(coord_symbols...)) + return SVector{$D, CUDAIndex}($(coord_symbols...)) end end @@ -121,12 +123,12 @@ end coord_exprs = map(1:D) do i quote Raw = (Position[$i] - Grid.Origin[$i]) * Grid.InvCellSize - Cell = floor(Int, Raw) - clamp(Cell, 0, Grid.Dims[$i] - 1) + Cell = floor(CUDAIndex, Raw) + clamp(Cell, CUDAIndex(0), CUDAIndex(Grid.Dims[$i] - 1)) end end return quote - return SVector{$D, Int}($(coord_exprs...)) + return SVector{$D, CUDAIndex}($(coord_exprs...)) end end @@ -144,7 +146,7 @@ function ComputeCellCountsKernel!(CellCounts, CellIds) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) CellId = CellIds[Index] - CUDA.atomic_add!(CellCounts, CellId, 1) + CUDA.atomic_add!(CellCounts, CellId, CUDAIndex(1)) end return nothing end @@ -153,9 +155,9 @@ function BuildParticleRangesKernel!(ParticleRanges, CellPrefix) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellPrefix) if Index == 1 - ParticleRanges[1] = 1 + ParticleRanges[1] = CUDAIndex(1) end - ParticleRanges[Index + 1] = CellPrefix[Index] + 1 + ParticleRanges[Index + 1] = CellPrefix[Index] + CUDAIndex(1) end return nothing end @@ -164,8 +166,8 @@ function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) CellId = CellIds[Index] - TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, 1) - ParticleOrder[TargetIndex] = Index + TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, CUDAIndex(1)) + ParticleOrder[Int(TargetIndex)] = CUDAIndex(Index) end return nothing end @@ -173,21 +175,21 @@ end function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid::CUDACellGrid{D, T}) where {D, T} CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x if CellId <= Grid.CellCount - CellCoord = CellCoordsFromId(CellId, Grid) - NeighborIndex = 0 + CellCoord = CellCoordsFromId(CUDAIndex(CellId), Grid) + NeighborIndex = CUDAIndex(0) for OffsetIndex in 1:length(NeighborOffsets) Offset = NeighborOffsets[OffsetIndex] NeighborCoord = CellCoord + Offset IsValid = true @inbounds for DimIndex in 1:D - if NeighborCoord[DimIndex] < 0 || NeighborCoord[DimIndex] >= Grid.Dims[DimIndex] + if NeighborCoord[DimIndex] < CUDAIndex(0) || NeighborCoord[DimIndex] >= CUDAIndex(Grid.Dims[DimIndex]) IsValid = false break end end if IsValid - NeighborIndex += 1 - NeighborCells[NeighborIndex, CellId] = LinearCellId(NeighborCoord, Grid) + NeighborIndex += CUDAIndex(1) + NeighborCells[Int(NeighborIndex), CellId] = LinearCellId(NeighborCoord, Grid) end end NeighborCounts[CellId] = NeighborIndex @@ -254,22 +256,22 @@ function NeighborLoopKernel!(InteractionKernel, Args...) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) - CellId = CellIds[Index] - StartIndex = ParticleRanges[CellId] - EndIndex = ParticleRanges[CellId + 1] - 1 + CellId = Int(CellIds[Index]) + StartIndex = Int(ParticleRanges[CellId]) + EndIndex = Int(ParticleRanges[CellId + 1]) - 1 for j in StartIndex:EndIndex - NeighborIndex = ParticleOrder[j] + NeighborIndex = Int(ParticleOrder[j]) if NeighborIndex != Index InteractionKernel(Index, NeighborIndex, Args...) end end - NeighborCount = NeighborCounts[CellId] + NeighborCount = Int(NeighborCounts[CellId]) for n in 1:NeighborCount - NeighborCell = NeighborCells[n, CellId] - StartIndex = ParticleRanges[NeighborCell] - EndIndex = ParticleRanges[NeighborCell + 1] - 1 + NeighborCell = Int(NeighborCells[n, CellId]) + StartIndex = Int(ParticleRanges[NeighborCell]) + EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1 for j in StartIndex:EndIndex - NeighborIndex = ParticleOrder[j] + NeighborIndex = Int(ParticleOrder[j]) InteractionKernel(Index, NeighborIndex, Args...) end end @@ -288,23 +290,23 @@ function NeighborLoopPerParticleKernel!(InitKernel, Args...) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) - CellId = CellIds[Index] + CellId = Int(CellIds[Index]) Accumulator = InitKernel(Index, Args...) - StartIndex = ParticleRanges[CellId] - EndIndex = ParticleRanges[CellId + 1] - 1 + StartIndex = Int(ParticleRanges[CellId]) + EndIndex = Int(ParticleRanges[CellId + 1]) - 1 for j in StartIndex:EndIndex - NeighborIndex = ParticleOrder[j] + NeighborIndex = Int(ParticleOrder[j]) if NeighborIndex != Index Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...) end end - NeighborCount = NeighborCounts[CellId] + NeighborCount = Int(NeighborCounts[CellId]) for n in 1:NeighborCount - NeighborCell = NeighborCells[n, CellId] - StartIndex = ParticleRanges[NeighborCell] - EndIndex = ParticleRanges[NeighborCell + 1] - 1 + NeighborCell = Int(NeighborCells[n, CellId]) + StartIndex = Int(ParticleRanges[NeighborCell]) + EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1 for j in StartIndex:EndIndex - NeighborIndex = ParticleOrder[j] + NeighborIndex = Int(ParticleOrder[j]) Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...) end end From ba7d83ccc8c79384e0f414a44ba650d47d895452 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:33:57 +0100 Subject: [PATCH 6/8] Add CUDA simulation path --- README.md | 5 + example/StillWedgeMDBC_CUDA.jl | 2 +- src/SPHCUDASimulation.jl | 352 +++++++++++++++++++++++++++++++++ src/SPHExample.jl | 3 + 4 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 src/SPHCUDASimulation.jl diff --git a/README.md b/README.md index 5c1632fe..346097a8 100644 --- a/README.md +++ b/README.md @@ -138,6 +138,11 @@ incrementally replace the neighbor loop where it makes sense for your use case. The `example/StillWedgeMDBC_CUDA.jl` script shows how to load the StillWedge geometry and run a CUDA neighbor-count pass using these utilities. +For an end-to-end GPU path (limited to `NoShifting`, `NoKernelOutput`, `NoMDBC`, +`ArtificialViscosity`, and `LinearDensityDiffusion`), use `RunSimulationCUDA` +from the `SPHCUDASimulation` module. This runs the StillWedge-style workflow on +CUDA arrays and syncs particle data to the CPU only for output/logging. + ## Help Questions or issues can be posted on the GitHub issue tracker. Response times may vary but all feedback is welcome. diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl index 8d472a85..a98ed83a 100644 --- a/example/StillWedgeMDBC_CUDA.jl +++ b/example/StillWedgeMDBC_CUDA.jl @@ -101,7 +101,7 @@ let println(" Max neighbors: ", MaximumNeighbors) println(" Avg neighbors: ", AverageNeighbors) - RunSimulation( + RunSimulationCUDA( SimGeometry = SimulationGeometry, SimMetaData = SimMetaDataWedge, SimConstants = SimConstantsWedge, diff --git a/src/SPHCUDASimulation.jl b/src/SPHCUDASimulation.jl new file mode 100644 index 00000000..cb8db28c --- /dev/null +++ b/src/SPHCUDASimulation.jl @@ -0,0 +1,352 @@ +module SPHCUDASimulation + +export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA! + +using CUDA +using StaticArrays +using Parameters +using LinearAlgebra + +using ..SimulationConstantsConfiguration +using ..SimulationGeometry +using ..SimulationLoggerConfiguration +using ..SimulationMetaDataConfiguration +using ..ProduceHDFVTK +using ..SPHCUDANeighborList +using ..SPHDensityDiffusionModels +using ..SPHKernels +using ..SPHViscosityModels +using ..TimeStepping: next_output_time +using ..SimulationEquations: EquationOfStateGamma7 +using ..OpenExternalPrograms + +struct CUDASimParticles{D, T} + Position::CuArray{SVector{D, T}} + Velocity::CuArray{SVector{D, T}} + Acceleration::CuArray{SVector{D, T}} + Density::CuArray{T} + Pressure::CuArray{T} + MotionLimiter::CuArray{T} + GravityFactor::CuArray{T} +end + +@inline function AllocateCUDASimParticles(SimParticles) + return CUDASimParticles( + CuArray(SimParticles.Position), + CuArray(SimParticles.Velocity), + CuArray(SimParticles.Acceleration), + CuArray(SimParticles.Density), + CuArray(SimParticles.Pressure), + CuArray(SimParticles.MotionLimiter), + CuArray(SimParticles.GravityFactor), + ) +end + +@inline function SyncParticlesFromCUDA!(SimParticles, CUDAParticles::CUDASimParticles) + copyto!(SimParticles.Position, Array(CUDAParticles.Position)) + copyto!(SimParticles.Velocity, Array(CUDAParticles.Velocity)) + copyto!(SimParticles.Acceleration, Array(CUDAParticles.Acceleration)) + copyto!(SimParticles.Density, Array(CUDAParticles.Density)) + copyto!(SimParticles.Pressure, Array(CUDAParticles.Pressure)) + return nothing +end + +@inline function ComputeBounds(Position) + MinCorner = reduce((A, B) -> min.(A, B), Position) + MaxCorner = reduce((A, B) -> max.(A, B), Position) + return MinCorner, MaxCorner +end + +@inline function BuildGrid(MinCorner, MaxCorner, CellSize) + Extents = MaxCorner - MinCorner + Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1) + return CUDACellGrid(MinCorner, CellSize, Dims) +end + +@generated function GravityVector(::Val{D}, Value::T) where {D, T} + Entries = [:(Index == $D ? Value : zero(T)) for Index in 1:D] + return quote + return SVector{$D, T}($(Entries...)) + end +end + +function PressureKernel!(Pressure, Density, SimConstants) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Pressure) + ρ = Density[Index] + Pressure[Index] = EquationOfStateGamma7(ρ, SimConstants.c₀, SimConstants.ρ₀) + end + return nothing +end + +function LimitDensityKernel!(Density, ρ₀, MotionLimiter) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Density) + if Density[Index] < ρ₀ && MotionLimiter[Index] == zero(eltype(MotionLimiter)) + Density[Index] = ρ₀ + end + end + return nothing +end + +function DensityEpsiKernel!(Density, DensityRate, DensityHalf, Δt) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Density) + Epsi = -(DensityRate[Index] / DensityHalf[Index]) * Δt + Density[Index] *= (2 - Epsi) / (2 + Epsi) + end + return nothing +end + +function HalfTimeStepKernel!(Position, Velocity, Acceleration, Density, + GravityFactor, MotionLimiter, + PositionHalf, VelocityHalf, DensityHalf, DensityRate, + DtHalf, GravityValue, Dimensions) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Position) + Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index]) + Acceleration[Index] = Acc + Factor = MotionLimiter[Index] + PositionHalf[Index] = Position[Index] + Velocity[Index] * DtHalf * Factor + VelocityHalf[Index] = Velocity[Index] + Acc * DtHalf * Factor + DensityHalf[Index] = Density[Index] + DensityRate[Index] * DtHalf + end + return nothing +end + +function FullTimeStepKernel!(Position, Velocity, Acceleration, GravityFactor, MotionLimiter, Dt, GravityValue, Dimensions) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Position) + Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index]) + Acceleration[Index] = Acc + Factor = MotionLimiter[Index] + Velocity[Index] += Acc * Dt * Factor + Position[Index] += (((Velocity[Index] + (Velocity[Index] - Acc * Dt * Factor)) / 2) * Dt) * Factor + end + return nothing +end + +function TimeStepBuffersKernel!(MaxVisc, MinDtForce, Position, Velocity, Acceleration, SimKernel) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Position) + R = Position[Index] + V = Velocity[Index] + A = Acceleration[Index] + RSq = sqrt(dot(R, R))^2 + CurrVisc = abs(SimKernel.h * dot(V, R) / (RSq + SimKernel.η²)) + AMag = norm(A) + CurrDtForce = AMag > 0 ? sqrt(SimKernel.h / AMag) : typemax(eltype(MinDtForce)) + MaxVisc[Index] = CurrVisc + MinDtForce[Index] = CurrDtForce + end + return nothing +end + +@inline function InitAccumulatorKernel(Index, Density, Position) + return (zero(eltype(Density)), zero(eltype(Position))) +end + +@inline function InteractionKernel(Index, NeighborIndex, Accumulator, + Position, Density, Pressure, Velocity, + MotionLimiter, SimKernel, SimConstants) + DensityRate, AccelerationValue = Accumulator + Δx = Position[Index] - Position[NeighborIndex] + Δx² = dot(Δx, Δx) + if Δx² <= SimKernel.H² + Distance = sqrt(abs(Δx²)) + Q = clamp(Distance * SimKernel.h⁻¹, zero(Distance), 2) + ∇W = ∇Wᵢⱼ(SimKernel, Q, Δx) + + ρᵢ = Density[Index] + ρⱼ = Density[NeighborIndex] + + Vᵢ = Velocity[Index] + Vⱼ = Velocity[NeighborIndex] + Vᵢⱼ = Vᵢ - Vⱼ + DensityTerm = dot(-Vᵢⱼ, ∇W) + DensityRateContribution = -ρᵢ * (SimConstants.m₀ / ρⱼ) * DensityTerm + + LinearρFactor = inv(SimConstants.Cb * SimConstants.γ) * SimConstants.ρ₀ + Pᵢⱼᴴ = SimConstants.ρ₀ * (-SimConstants.g) * -Δx[end] + ρᵢⱼᴴ = Pᵢⱼᴴ * LinearρFactor + InvD = one(eltype(ρᵢ)) / (Δx² + SimKernel.η²) + ρⱼᵢ = ρⱼ - ρᵢ + ψᵢⱼ = 2 * (ρⱼᵢ - ρᵢⱼᴴ) * (-Δx) * InvD + MotionFactor = MotionLimiter[Index] * MotionLimiter[NeighborIndex] + DiffusionContribution = SimConstants.δᵩ * SimKernel.h * SimConstants.c₀ * (SimConstants.m₀ / ρⱼ) * dot(ψᵢⱼ, ∇W) * MotionFactor + + DensityRate += DensityRateContribution + DiffusionContribution + + Pᵢ = Pressure[Index] + Pⱼ = Pressure[NeighborIndex] + Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ) + Fab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, Q, SimConstants.dx) + AccelerationContribution = -SimConstants.m₀ * (Pfac + Fab) * ∇W + + ViscTerm = zero(AccelerationContribution) + VDotX = dot(Vᵢⱼ, Δx) + if VDotX < 0 + ρ̄ = 0.5 * (ρᵢ + ρⱼ) + μᵢⱼ = SimKernel.h * VDotX / (Δx² + SimKernel.η²) + ViscTerm = -SimConstants.m₀ * (-SimConstants.α * SimConstants.c₀ * μᵢⱼ) / ρ̄ * ∇W + end + + AccelerationValue += AccelerationContribution + ViscTerm + end + + return DensityRate, AccelerationValue +end + +@inline function FinalKernel(Index, Accumulator, DensityRate, Acceleration) + DensityRateValue, AccelerationValue = Accumulator + DensityRate[Index] = DensityRateValue + Acceleration[Index] = AccelerationValue + return nothing +end + +function ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) + MaxViscHost = maximum(Array(MaxVisc)) + MinDtHost = minimum(Array(MinDtForce)) + DtLimit = SimKernel.h / (SimConstants.c₀ + MaxViscHost) + return SimConstants.CFL * min(MinDtHost, DtLimit) +end + +function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}}, + SimMetaData::SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, LMode}, + SimConstants::SimulationConstants, + SimKernel::SPHKernelInstance, + SimLogger::SimulationLogger, + SimParticles, + SimViscosity::ArtificialViscosity, + SimDensityDiffusion::LinearDensityDiffusion, + ) where {Dimensions, FloatType, LMode<:LogMode} + + NumberOfPoints = length(SimParticles) + CUDAParticles = AllocateCUDASimParticles(SimParticles) + + DensityRate = CUDA.zeros(FloatType, NumberOfPoints) + VelocityHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints) + PositionHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints) + DensityHalf = CUDA.zeros(FloatType, NumberOfPoints) + MaxVisc = CUDA.zeros(FloatType, NumberOfPoints) + MinDtForce = CUDA.zeros(FloatType, NumberOfPoints) + + MinCorner, MaxCorner = ComputeBounds(SimParticles.Position) + Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H) + NeighborList = AllocateCUDANeighborList(Grid, NumberOfPoints) + + output = SetupVTKOutput(SimMetaData, SimParticles, SimKernel, Dimensions) + + SimMetaData.OutputIterationCounter = 1 + output.enqueue_particles(SimMetaData.OutputIterationCounter) + + InitializeLog!(SimMetaData, SimLogger, SimConstants, SimKernel, SimViscosity, SimDensityDiffusion, SimGeometry, SimParticles) + + Threads = 256 + Blocks = cld(NumberOfPoints, Threads) + DimensionsVal = Val(Dimensions) + + CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) + CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!( + MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel, + ) + Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) + + while SimMetaData.TotalTime <= next_output_time(SimMetaData) + UpdateNeighborsCUDA!(NeighborList, CUDAParticles.Position) + + CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) + + NeighborLoopPerParticleCUDA!( + InitAccumulatorKernel, + InteractionKernel, + FinalKernel, + NeighborList, + CUDAParticles.Position, + CUDAParticles.Density, + CUDAParticles.Pressure, + CUDAParticles.Velocity, + CUDAParticles.MotionLimiter, + SimKernel, + SimConstants, + DensityRate, + CUDAParticles.Acceleration, + ) + + DtHalf = Dt * 0.5 + CUDA.@cuda threads=Threads blocks=Blocks HalfTimeStepKernel!( + CUDAParticles.Position, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + CUDAParticles.Density, + CUDAParticles.GravityFactor, + CUDAParticles.MotionLimiter, + PositionHalf, + VelocityHalf, + DensityHalf, + DensityRate, + DtHalf, + SimConstants.g, + DimensionsVal, + ) + + CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(DensityHalf, SimConstants.ρ₀, CUDAParticles.MotionLimiter) + + CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, DensityHalf, SimConstants) + + NeighborLoopPerParticleCUDA!( + InitAccumulatorKernel, + InteractionKernel, + FinalKernel, + NeighborList, + PositionHalf, + DensityHalf, + CUDAParticles.Pressure, + VelocityHalf, + CUDAParticles.MotionLimiter, + SimKernel, + SimConstants, + DensityRate, + CUDAParticles.Acceleration, + ) + + CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter) + CUDA.@cuda threads=Threads blocks=Blocks DensityEpsiKernel!(CUDAParticles.Density, DensityRate, DensityHalf, Dt) + + CUDA.@cuda threads=Threads blocks=Blocks FullTimeStepKernel!( + CUDAParticles.Position, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + CUDAParticles.GravityFactor, + CUDAParticles.MotionLimiter, + Dt, + SimConstants.g, + DimensionsVal, + ) + + CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!( + MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel, + ) + Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) + + SimMetaData.Iteration += 1 + SimMetaData.CurrentTimeStep = Dt + SimMetaData.TotalTime += Dt + + SyncParticlesFromCUDA!(SimParticles, CUDAParticles) + LogStep!(SimMetaData, SimLogger) + SimMetaData.OutputIterationCounter += 1 + output.enqueue_particles(SimMetaData.OutputIterationCounter) + + if SimMetaData.TotalTime > SimMetaData.SimulationTime + output.close_files() + FinalizeLog!(SimMetaData, SimLogger) + AutoOpenLogFile(SimLogger, SimMetaData) + break + end + end + + return nothing +end + +end diff --git a/src/SPHExample.jl b/src/SPHExample.jl index 071b1c35..113aa6b8 100644 --- a/src/SPHExample.jl +++ b/src/SPHExample.jl @@ -17,6 +17,7 @@ module SPHExample const HasCUDA = Base.find_package("CUDA") !== nothing if HasCUDA include("SPHCUDANeighborList.jl") + include("SPHCUDASimulation.jl") end include("SPHCellList.jl") #Must be last @@ -69,6 +70,8 @@ module SPHExample export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA! + using .SPHCUDASimulation + export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA! end using .SPHCellList From c8f9ef5c3a96bed9dde4a699c8a33e0b56d5714f Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:39:07 +0100 Subject: [PATCH 7/8] Fix CUDA atomic index types --- src/SPHCUDANeighborList.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl index 4cf1acf7..19db7d63 100644 --- a/src/SPHCUDANeighborList.jl +++ b/src/SPHCUDANeighborList.jl @@ -146,7 +146,7 @@ function ComputeCellCountsKernel!(CellCounts, CellIds) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) CellId = CellIds[Index] - CUDA.atomic_add!(CellCounts, CellId, CUDAIndex(1)) + CUDA.atomic_add!(CellCounts, Int(CellId), CUDAIndex(1)) end return nothing end @@ -166,7 +166,7 @@ function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds) Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x if Index <= length(CellIds) CellId = CellIds[Index] - TargetIndex = CUDA.atomic_add!(CellOffsets, CellId, CUDAIndex(1)) + TargetIndex = CUDA.atomic_add!(CellOffsets, Int(CellId), CUDAIndex(1)) ParticleOrder[Int(TargetIndex)] = CUDAIndex(Index) end return nothing From be8d34dbb7125938c76954d80639daf49561a7dc Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 00:48:28 +0100 Subject: [PATCH 8/8] Use CPU neighbor list for CUDA loops --- README.md | 4 +- example/StillWedgeMDBC_CUDA.jl | 29 ++++++++-- src/SPHCUDANeighborList.jl | 101 ++++++++++++++++++++++++++++++++- src/SPHCUDASimulation.jl | 37 ++++++++++-- src/SPHExample.jl | 3 +- 5 files changed, 160 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 346097a8..886fe15b 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,9 @@ geometry and run a CUDA neighbor-count pass using these utilities. For an end-to-end GPU path (limited to `NoShifting`, `NoKernelOutput`, `NoMDBC`, `ArtificialViscosity`, and `LinearDensityDiffusion`), use `RunSimulationCUDA` from the `SPHCUDASimulation` module. This runs the StillWedge-style workflow on -CUDA arrays and syncs particle data to the CPU only for output/logging. +CUDA arrays and builds neighbor lists on the CPU, then launches GPU neighbor +loops (particle data is synced to the CPU only for neighbor-list construction +and output/logging). ## Help diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl index a98ed83a..e7ac0cc6 100644 --- a/example/StillWedgeMDBC_CUDA.jl +++ b/example/StillWedgeMDBC_CUDA.jl @@ -84,12 +84,31 @@ let CleanUpSimulationFolder(SimMetaDataWedge.SaveLocation) PositionGPU = CuArray(SimParticles.Position) - MinCorner, MaxCorner = ComputeBounds(SimParticles.Position) - Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H) - NeighborList = AllocateCUDANeighborList(Grid, length(PositionGPU)) - UpdateNeighborsCUDA!(NeighborList, PositionGPU) - NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborList, SimKernel.H²) + ParticleRanges = zeros(Int, length(SimParticles) + 2) + UniqueCells = zeros(CartesianIndex{Dimensions}, length(SimParticles)) + CellDict = Dict{CartesianIndex{Dimensions}, Int}() + FullStencil = ConstructStencil(Val(Dimensions)) + NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)] + ParticleOrder = zeros(Int, length(SimParticles)) + CellOffsets = zeros(Int, length(ParticleRanges)) + CellIdsHost = zeros(Int, length(SimParticles)) + + IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets) + UniqueCellsView = view(UniqueCells, 1:IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict) + @inbounds for Index in eachindex(CellIdsHost, SimParticles.Cells) + CellIdsHost[Index] = CellDict[SimParticles.Cells[Index]] + end + NeighborListPacked = UpdateNeighborsCPUToCUDA!( + nothing, + CellIdsHost, + ParticleOrder, + view(ParticleRanges, 1:(IndexCounter + 1)), + view(NeighborCellLists, 1:IndexCounter), + ) + + NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborListPacked, SimKernel.H²) NeighborCountsHost = Array(NeighborCounts) MinimumNeighbors = minimum(NeighborCountsHost) diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl index 19db7d63..4abc0226 100644 --- a/src/SPHCUDANeighborList.jl +++ b/src/SPHCUDANeighborList.jl @@ -3,7 +3,8 @@ module SPHCUDANeighborList export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, UpdateNeighborsCUDA!, NeighborLoopCUDA!, - NeighborLoopPerParticleCUDA! + NeighborLoopPerParticleCUDA!, CUDAPackedNeighborList, + AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA! const CUDAIndex = Int32 @@ -51,6 +52,16 @@ struct CUDANeighborList{D, T} NeighborOffsets::CuArray{SVector{D, CUDAIndex}} end +struct CUDAPackedNeighborList + CellIds::CuArray{CUDAIndex} + ParticleOrder::CuArray{CUDAIndex} + ParticleRanges::CuArray{CUDAIndex} + NeighborCells::CuArray{CUDAIndex} + NeighborCounts::CuArray{CUDAIndex} + CellCount::Int + MaxNeighbors::Int +end + """ ConstructNeighborOffsets(Val(D)) @@ -95,6 +106,56 @@ function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) return NeighborList end +function AllocateCUDAPackedNeighborList(ParticleCount::Int, CellCount::Int, MaxNeighbors::Int) + CellIds = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleRanges = CUDA.zeros(CUDAIndex, CellCount + 1) + NeighborCells = CUDA.zeros(CUDAIndex, MaxNeighbors, CellCount) + NeighborCounts = CUDA.zeros(CUDAIndex, CellCount) + return CUDAPackedNeighborList( + CellIds, + ParticleOrder, + ParticleRanges, + NeighborCells, + NeighborCounts, + CellCount, + MaxNeighbors, + ) +end + +function UpdateNeighborsCPUToCUDA!(NeighborList::Union{Nothing, CUDAPackedNeighborList}, + CellIdsHost, + ParticleOrderHost, + ParticleRangesHost, + NeighborCellListsHost) + CellCount = length(NeighborCellListsHost) + MaxNeighbors = maximum(length, NeighborCellListsHost; init=0) + if NeighborList === nothing || NeighborList.CellCount != CellCount || NeighborList.MaxNeighbors < MaxNeighbors + NeighborList = AllocateCUDAPackedNeighborList( + length(CellIdsHost), + CellCount, + max(MaxNeighbors, 1), + ) + end + + NeighborCountsHost = zeros(CUDAIndex, CellCount) + NeighborCellsHost = zeros(CUDAIndex, NeighborList.MaxNeighbors, CellCount) + @inbounds for CellIndex in 1:CellCount + Neighbors = NeighborCellListsHost[CellIndex] + NeighborCountsHost[CellIndex] = CUDAIndex(length(Neighbors)) + for (NeighborOffset, NeighborIndex) in enumerate(Neighbors) + NeighborCellsHost[NeighborOffset, CellIndex] = CUDAIndex(NeighborIndex) + end + end + + copyto!(NeighborList.CellIds, CUDAIndex.(CellIdsHost)) + copyto!(NeighborList.ParticleOrder, CUDAIndex.(ParticleOrderHost)) + copyto!(NeighborList.ParticleRanges, CUDAIndex.(ParticleRangesHost)) + copyto!(NeighborList.NeighborCells, NeighborCellsHost) + copyto!(NeighborList.NeighborCounts, NeighborCountsHost) + return NeighborList +end + @inline function LinearCellId(CellCoords::SVector{D, CUDAIndex}, Grid::CUDACellGrid{D, T}) where {D, T} CellId = CUDAIndex(1) @inbounds for DimIndex in 1:D @@ -337,6 +398,23 @@ function NeighborLoopCUDA!(InteractionKernel, return nothing end +function NeighborLoopCUDA!(InteractionKernel, + NeighborList::CUDAPackedNeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!( + InteractionKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + """ NeighborLoopPerParticleCUDA!(InitKernel, InteractionKernel, FinalKernel, NeighborList, Args...; Threads=256) @@ -368,4 +446,25 @@ function NeighborLoopPerParticleCUDA!(InitKernel, return nothing end +function NeighborLoopPerParticleCUDA!(InitKernel, + InteractionKernel, + FinalKernel, + NeighborList::CUDAPackedNeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!( + InitKernel, + InteractionKernel, + FinalKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + end diff --git a/src/SPHCUDASimulation.jl b/src/SPHCUDASimulation.jl index cb8db28c..1736e373 100644 --- a/src/SPHCUDASimulation.jl +++ b/src/SPHCUDASimulation.jl @@ -19,6 +19,7 @@ using ..SPHViscosityModels using ..TimeStepping: next_output_time using ..SimulationEquations: EquationOfStateGamma7 using ..OpenExternalPrograms +using ..SPHNeighborList: BuildNeighborCellLists!, ConstructStencil, UpdateNeighbors! struct CUDASimParticles{D, T} Position::CuArray{SVector{D, T}} @@ -63,6 +64,13 @@ end return CUDACellGrid(MinCorner, CellSize, Dims) end +@inline function BuildCellIds!(CellIds, Cells, CellDict) + @inbounds for Index in eachindex(CellIds, Cells) + CellIds[Index] = CellDict[Cells[Index]] + end + return nothing +end + @generated function GravityVector(::Val{D}, Value::T) where {D, T} Entries = [:(Index == $D ? Value : zero(T)) for Index in 1:D] return quote @@ -231,9 +239,15 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} MaxVisc = CUDA.zeros(FloatType, NumberOfPoints) MinDtForce = CUDA.zeros(FloatType, NumberOfPoints) - MinCorner, MaxCorner = ComputeBounds(SimParticles.Position) - Grid = BuildGrid(MinCorner, MaxCorner, SimKernel.H) - NeighborList = AllocateCUDANeighborList(Grid, NumberOfPoints) + ParticleRanges = zeros(Int, NumberOfPoints + 1 + 1) + UniqueCells = zeros(CartesianIndex{Dimensions}, NumberOfPoints) + CellDict = Dict{CartesianIndex{Dimensions}, Int}() + FullStencil = ConstructStencil(Val(Dimensions)) + NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)] + ParticleOrder = zeros(Int, NumberOfPoints) + CellOffsets = zeros(Int, length(ParticleRanges)) + CellIdsHost = zeros(Int, NumberOfPoints) + PackedNeighborList = nothing output = SetupVTKOutput(SimMetaData, SimParticles, SimKernel, Dimensions) @@ -253,7 +267,18 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) while SimMetaData.TotalTime <= next_output_time(SimMetaData) - UpdateNeighborsCUDA!(NeighborList, CUDAParticles.Position) + copyto!(SimParticles.Position, Array(CUDAParticles.Position)) + IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets) + UniqueCellsView = view(UniqueCells, 1:IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict) + BuildCellIds!(CellIdsHost, SimParticles.Cells, CellDict) + PackedNeighborList = UpdateNeighborsCPUToCUDA!( + PackedNeighborList, + CellIdsHost, + ParticleOrder, + view(ParticleRanges, 1:(IndexCounter + 1)), + view(NeighborCellLists, 1:IndexCounter), + ) CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) @@ -261,7 +286,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} InitAccumulatorKernel, InteractionKernel, FinalKernel, - NeighborList, + PackedNeighborList, CUDAParticles.Position, CUDAParticles.Density, CUDAParticles.Pressure, @@ -298,7 +323,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} InitAccumulatorKernel, InteractionKernel, FinalKernel, - NeighborList, + PackedNeighborList, PositionHalf, DensityHalf, CUDAParticles.Pressure, diff --git a/src/SPHExample.jl b/src/SPHExample.jl index 113aa6b8..ae313057 100644 --- a/src/SPHExample.jl +++ b/src/SPHExample.jl @@ -69,7 +69,8 @@ module SPHExample using .SPHCUDANeighborList export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, - UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA! + UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!, + CUDAPackedNeighborList, AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA! using .SPHCUDASimulation export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA! end