diff --git a/README.md b/README.md index c23bc265..886fe15b 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ The project demonstrates how to assemble a small SPH solver with Julia. It focus - **Dynamic boundary condition** – inspired by DualSPHysics. - **Density diffusion** – based on Fourtakas et al. 2019 to reduce pressure noise. - **Wendland quintic kernel** – simple and stable without tensile corrections. +- **CUDA neighbor lists** – optional GPU neighbor cell list and loop utilities for CUDA arrays. ## Folder Structure @@ -91,6 +92,59 @@ one plus the particles in its neighbor stencil. Neighbor rebuilds use a per-cell ordering buffer without reordering particle storage. +### CUDA Neighbor Utilities + +This package ships with an optional CUDA-focused neighbor list to offload the +cell list construction and neighbor iteration. Install CUDA.jl in the project +environment before using these helpers: + +```julia +using Pkg +Pkg.add("CUDA") +``` + +The CUDA utilities are only loaded when CUDA.jl is available. The workflow is: + +1. Build a `CUDACellGrid` describing the simulation bounds and desired cell size. +2. Allocate a `CUDANeighborList` once for the particle count. +3. Call `UpdateNeighborsCUDA!` each step after updating positions. +4. Launch `NeighborLoopCUDA!` with a GPU kernel that computes pairwise + interactions. + For per-particle accumulation (mirroring `NeighborLoopPerParticle!`), + use `NeighborLoopPerParticleCUDA!` with init/interact/final callbacks. + +Here is a minimal sketch: + +```julia +using CUDA +using SPHExample + +grid = CUDACellGrid(SVector(0.0f0, 0.0f0), 0.01f0, SVector(128, 128)) +neighbor_list = AllocateCUDANeighborList(grid, length(position)) + +UpdateNeighborsCUDA!(neighbor_list, position) + +function InteractionKernel(i, j, acc, pos) + @inbounds acc[i] += pos[j] - pos[i] + return nothing +end + +NeighborLoopCUDA!(InteractionKernel, neighbor_list, acc, position) +``` + +These CUDA utilities are intended as building blocks for integrating GPU +workflows into the existing solver. You can keep the CPU path intact and +incrementally replace the neighbor loop where it makes sense for your use case. +The `example/StillWedgeMDBC_CUDA.jl` script shows how to load the StillWedge +geometry and run a CUDA neighbor-count pass using these utilities. + +For an end-to-end GPU path (limited to `NoShifting`, `NoKernelOutput`, `NoMDBC`, +`ArtificialViscosity`, and `LinearDensityDiffusion`), use `RunSimulationCUDA` +from the `SPHCUDASimulation` module. This runs the StillWedge-style workflow on +CUDA arrays and builds neighbor lists on the CPU, then launches GPU neighbor +loops (particle data is synced to the CPU only for neighbor-list construction +and output/logging). + ## Help Questions or issues can be posted on the GitHub issue tracker. Response times may vary but all feedback is welcome. diff --git a/example/StillWedgeMDBC_CUDA.jl b/example/StillWedgeMDBC_CUDA.jl new file mode 100644 index 00000000..e7ac0cc6 --- /dev/null +++ b/example/StillWedgeMDBC_CUDA.jl @@ -0,0 +1,133 @@ +using SPHExample +using CUDA +using StaticArrays + +if !SPHExample.HasCUDA + error("CUDA.jl is required to run this example.") +end + +@inline function ComputeBounds(Position) + MinCorner = reduce((A, B) -> min.(A, B), Position) + MaxCorner = reduce((A, B) -> max.(A, B), Position) + return MinCorner, MaxCorner +end + +@inline function BuildGrid(MinCorner, MaxCorner, CellSize) + Extents = MaxCorner - MinCorner + Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1) + return CUDACellGrid(MinCorner, CellSize, Dims) +end + +@inline function SquaredNorm(Vector) + Accumulator = zero(eltype(Vector)) + @inbounds for Index in eachindex(Vector) + Accumulator += Vector[Index] * Vector[Index] + end + return Accumulator +end + +@inline function NeighborCountInteractionKernel(Index, NeighborIndex, Counts, Position, SupportRadiusSquared) + Δx = Position[Index] - Position[NeighborIndex] + if SquaredNorm(Δx) <= SupportRadiusSquared + CUDA.atomic_add!(Counts, Index, 1) + end + return nothing +end + +function CountNeighborsCUDA(Position, NeighborList, SupportRadiusSquared) + Counts = CUDA.zeros(Int, length(Position)) + NeighborLoopCUDA!(NeighborCountInteractionKernel, NeighborList, Counts, Position, SupportRadiusSquared) + return Counts +end + +let + Dimensions = 2 + FloatType = Float32 + + SimConstantsWedge = SimulationConstants{FloatType}(dx=0.02f0, c₀=42.485762f0, δᵩ=0.1f0, CFL=0.5f0) + + FixedBoundary = Geometry{Dimensions, FloatType}( + CSVFile = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Bound.csv", + GroupMarker = 1, + Type = Fixed, + Motion = nothing, + ) + + Water = Geometry{Dimensions, FloatType}( + CSVFile = "./input/still_wedge/StillWedge_Dp$(SimConstantsWedge.dx)_Fluid.csv", + GroupMarker = 2, + Type = Fluid, + Motion = nothing, + ) + + SimMetaDataWedge = SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, StoreLog}( + SimulationName="StillWedge", + SaveLocation="W:/Simulations/StillWedge2D_MDBC_CUDA", + SimulationTime=4.0f0, + OutputTimes=0.01f0, + VisualizeInParaview=true, + ExportSingleVTKHDF=true, + ExportGridCells=true, + OpenLogFile=true, + ) + + if !isdir(SimMetaDataWedge.SaveLocation) + mkdir(SimMetaDataWedge.SaveLocation) + end + + SimulationGeometry = [FixedBoundary; Water] + SimParticles = AllocateDataStructures(SimulationGeometry, SimMetaDataWedge) + + SimKernel = SPHKernelInstance{Dimensions, FloatType}(WendlandC2(); dx = SimConstantsWedge.dx) + SimLogger = SimulationLogger(SimMetaDataWedge.SaveLocation) + + CleanUpSimulationFolder(SimMetaDataWedge.SaveLocation) + + PositionGPU = CuArray(SimParticles.Position) + + ParticleRanges = zeros(Int, length(SimParticles) + 2) + UniqueCells = zeros(CartesianIndex{Dimensions}, length(SimParticles)) + CellDict = Dict{CartesianIndex{Dimensions}, Int}() + FullStencil = ConstructStencil(Val(Dimensions)) + NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)] + ParticleOrder = zeros(Int, length(SimParticles)) + CellOffsets = zeros(Int, length(ParticleRanges)) + CellIdsHost = zeros(Int, length(SimParticles)) + + IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets) + UniqueCellsView = view(UniqueCells, 1:IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict) + @inbounds for Index in eachindex(CellIdsHost, SimParticles.Cells) + CellIdsHost[Index] = CellDict[SimParticles.Cells[Index]] + end + NeighborListPacked = UpdateNeighborsCPUToCUDA!( + nothing, + CellIdsHost, + ParticleOrder, + view(ParticleRanges, 1:(IndexCounter + 1)), + view(NeighborCellLists, 1:IndexCounter), + ) + + NeighborCounts = CountNeighborsCUDA(PositionGPU, NeighborListPacked, SimKernel.H²) + + NeighborCountsHost = Array(NeighborCounts) + MinimumNeighbors = minimum(NeighborCountsHost) + MaximumNeighbors = maximum(NeighborCountsHost) + AverageNeighbors = sum(NeighborCountsHost) / length(NeighborCountsHost) + + println("StillWedge CUDA neighbor stats (within H):") + println(" Min neighbors: ", MinimumNeighbors) + println(" Max neighbors: ", MaximumNeighbors) + println(" Avg neighbors: ", AverageNeighbors) + + RunSimulationCUDA( + SimGeometry = SimulationGeometry, + SimMetaData = SimMetaDataWedge, + SimConstants = SimConstantsWedge, + SimKernel = SimKernel, + SimLogger = SimLogger, + SimParticles = SimParticles, + SimViscosity = ArtificialViscosity(), + SimDensityDiffusion = LinearDensityDiffusion(), + ) +end diff --git a/src/SPHCUDANeighborList.jl b/src/SPHCUDANeighborList.jl new file mode 100644 index 00000000..4abc0226 --- /dev/null +++ b/src/SPHCUDANeighborList.jl @@ -0,0 +1,470 @@ +module SPHCUDANeighborList + +export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, + ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, + UpdateNeighborsCUDA!, NeighborLoopCUDA!, + NeighborLoopPerParticleCUDA!, CUDAPackedNeighborList, + AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA! + +const CUDAIndex = Int32 + +using CUDA +using StaticArrays + +""" + CUDACellGrid(Origin, CellSize, Dims) + +Describe a uniform cell grid for CUDA neighbor list construction. +""" +struct CUDACellGrid{D, T} + Origin::SVector{D, T} + InvCellSize::T + Dims::SVector{D, Int} + Strides::SVector{D, Int} + CellCount::Int +end + +""" + CUDACellGrid(Origin, CellSize, Dims) + +Build a CUDA cell grid from an origin, cell size, and grid dimensions. +""" +function CUDACellGrid(Origin::SVector{D, T}, CellSize::T, Dims::SVector{D, Int}) where {D, T} + Strides = SVector{D, Int}(ntuple(i -> i == 1 ? 1 : prod(Dims[1:(i - 1)]), D)) + CellCount = prod(Dims) + return CUDACellGrid{D, T}(Origin, inv(CellSize), Dims, Strides, CellCount) +end + +""" + CUDANeighborList(Grid, ...) + +CUDA storage for per-cell particle ranges and neighbor cell indices. +""" +struct CUDANeighborList{D, T} + Grid::CUDACellGrid{D, T} + CellIds::CuArray{CUDAIndex} + ParticleOrder::CuArray{CUDAIndex} + ParticleRanges::CuArray{CUDAIndex} + CellCounts::CuArray{CUDAIndex} + CellOffsets::CuArray{CUDAIndex} + NeighborCells::CuArray{CUDAIndex} + NeighborCounts::CuArray{CUDAIndex} + NeighborOffsets::CuArray{SVector{D, CUDAIndex}} +end + +struct CUDAPackedNeighborList + CellIds::CuArray{CUDAIndex} + ParticleOrder::CuArray{CUDAIndex} + ParticleRanges::CuArray{CUDAIndex} + NeighborCells::CuArray{CUDAIndex} + NeighborCounts::CuArray{CUDAIndex} + CellCount::Int + MaxNeighbors::Int +end + +""" + ConstructNeighborOffsets(Val(D)) + +Return the non-zero stencil offsets for a D-dimensional Moore neighborhood. +""" +function ConstructNeighborOffsets(::Val{D}) where {D} + Offsets = SVector{D, CUDAIndex}[] + for Offset in CartesianIndices(ntuple(_ -> -1:1, D)) + if !all(iszero, Tuple(Offset)) + push!(Offsets, SVector{D, CUDAIndex}(Tuple(Offset))) + end + end + return Offsets +end + +""" + AllocateCUDANeighborList(Grid, ParticleCount) + +Allocate GPU buffers for neighbor cell lists and particle ordering. +""" +function AllocateCUDANeighborList(Grid::CUDACellGrid{D, T}, ParticleCount::Int) where {D, T} + CellIds = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleRanges = CUDA.zeros(CUDAIndex, Grid.CellCount + 1) + CellCounts = CUDA.zeros(CUDAIndex, Grid.CellCount) + CellOffsets = CUDA.zeros(CUDAIndex, Grid.CellCount) + NeighborOffsets = CuArray(ConstructNeighborOffsets(Val(D))) + NeighborCells = CUDA.zeros(CUDAIndex, length(NeighborOffsets), Grid.CellCount) + NeighborCounts = CUDA.zeros(CUDAIndex, Grid.CellCount) + NeighborList = CUDANeighborList{D, T}( + Grid, + CellIds, + ParticleOrder, + ParticleRanges, + CellCounts, + CellOffsets, + NeighborCells, + NeighborCounts, + NeighborOffsets, + ) + BuildNeighborCellListsCUDA!(NeighborList) + return NeighborList +end + +function AllocateCUDAPackedNeighborList(ParticleCount::Int, CellCount::Int, MaxNeighbors::Int) + CellIds = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleOrder = CUDA.zeros(CUDAIndex, ParticleCount) + ParticleRanges = CUDA.zeros(CUDAIndex, CellCount + 1) + NeighborCells = CUDA.zeros(CUDAIndex, MaxNeighbors, CellCount) + NeighborCounts = CUDA.zeros(CUDAIndex, CellCount) + return CUDAPackedNeighborList( + CellIds, + ParticleOrder, + ParticleRanges, + NeighborCells, + NeighborCounts, + CellCount, + MaxNeighbors, + ) +end + +function UpdateNeighborsCPUToCUDA!(NeighborList::Union{Nothing, CUDAPackedNeighborList}, + CellIdsHost, + ParticleOrderHost, + ParticleRangesHost, + NeighborCellListsHost) + CellCount = length(NeighborCellListsHost) + MaxNeighbors = maximum(length, NeighborCellListsHost; init=0) + if NeighborList === nothing || NeighborList.CellCount != CellCount || NeighborList.MaxNeighbors < MaxNeighbors + NeighborList = AllocateCUDAPackedNeighborList( + length(CellIdsHost), + CellCount, + max(MaxNeighbors, 1), + ) + end + + NeighborCountsHost = zeros(CUDAIndex, CellCount) + NeighborCellsHost = zeros(CUDAIndex, NeighborList.MaxNeighbors, CellCount) + @inbounds for CellIndex in 1:CellCount + Neighbors = NeighborCellListsHost[CellIndex] + NeighborCountsHost[CellIndex] = CUDAIndex(length(Neighbors)) + for (NeighborOffset, NeighborIndex) in enumerate(Neighbors) + NeighborCellsHost[NeighborOffset, CellIndex] = CUDAIndex(NeighborIndex) + end + end + + copyto!(NeighborList.CellIds, CUDAIndex.(CellIdsHost)) + copyto!(NeighborList.ParticleOrder, CUDAIndex.(ParticleOrderHost)) + copyto!(NeighborList.ParticleRanges, CUDAIndex.(ParticleRangesHost)) + copyto!(NeighborList.NeighborCells, NeighborCellsHost) + copyto!(NeighborList.NeighborCounts, NeighborCountsHost) + return NeighborList +end + +@inline function LinearCellId(CellCoords::SVector{D, CUDAIndex}, Grid::CUDACellGrid{D, T}) where {D, T} + CellId = CUDAIndex(1) + @inbounds for DimIndex in 1:D + CellId += CellCoords[DimIndex] * CUDAIndex(Grid.Strides[DimIndex]) + end + return CellId +end + +@generated function CellCoordsFromId(CellId::CUDAIndex, Grid::CUDACellGrid{D, T}) where {D, T} + coord_symbols = [Symbol(:Coord_, i) for i in 1:D] + coord_defs = map(enumerate(coord_symbols)) do (i, sym) + quote + Dim = CUDAIndex(Grid.Dims[$i]) + $sym = Remaining % Dim + Remaining = Remaining ÷ Dim + end + end + return quote + Remaining = CellId - CUDAIndex(1) + $(coord_defs...) + return SVector{$D, CUDAIndex}($(coord_symbols...)) + end +end + +@generated function CellCoordsFromPosition(Position, Grid::CUDACellGrid{D, T}) where {D, T} + coord_exprs = map(1:D) do i + quote + Raw = (Position[$i] - Grid.Origin[$i]) * Grid.InvCellSize + Cell = floor(CUDAIndex, Raw) + clamp(Cell, CUDAIndex(0), CUDAIndex(Grid.Dims[$i] - 1)) + end + end + return quote + return SVector{$D, CUDAIndex}($(coord_exprs...)) + end +end + +function ComputeCellIdsKernel!(CellIds, Position, Grid::CUDACellGrid{D, T}) where {D, T} + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + Pos = Position[Index] + Coords = CellCoordsFromPosition(Pos, Grid) + CellIds[Index] = LinearCellId(Coords, Grid) + end + return nothing +end + +function ComputeCellCountsKernel!(CellCounts, CellIds) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = CellIds[Index] + CUDA.atomic_add!(CellCounts, Int(CellId), CUDAIndex(1)) + end + return nothing +end + +function BuildParticleRangesKernel!(ParticleRanges, CellPrefix) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellPrefix) + if Index == 1 + ParticleRanges[1] = CUDAIndex(1) + end + ParticleRanges[Index + 1] = CellPrefix[Index] + CUDAIndex(1) + end + return nothing +end + +function BuildParticleOrderKernel!(ParticleOrder, CellOffsets, CellIds) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = CellIds[Index] + TargetIndex = CUDA.atomic_add!(CellOffsets, Int(CellId), CUDAIndex(1)) + ParticleOrder[Int(TargetIndex)] = CUDAIndex(Index) + end + return nothing +end + +function BuildNeighborCellListsKernel!(NeighborCells, NeighborCounts, NeighborOffsets, Grid::CUDACellGrid{D, T}) where {D, T} + CellId = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if CellId <= Grid.CellCount + CellCoord = CellCoordsFromId(CUDAIndex(CellId), Grid) + NeighborIndex = CUDAIndex(0) + for OffsetIndex in 1:length(NeighborOffsets) + Offset = NeighborOffsets[OffsetIndex] + NeighborCoord = CellCoord + Offset + IsValid = true + @inbounds for DimIndex in 1:D + if NeighborCoord[DimIndex] < CUDAIndex(0) || NeighborCoord[DimIndex] >= CUDAIndex(Grid.Dims[DimIndex]) + IsValid = false + break + end + end + if IsValid + NeighborIndex += CUDAIndex(1) + NeighborCells[Int(NeighborIndex), CellId] = LinearCellId(NeighborCoord, Grid) + end + end + NeighborCounts[CellId] = NeighborIndex + end + return nothing +end + +""" + BuildNeighborCellListsCUDA!(NeighborList) + +Precompute the neighbor cell indices for each grid cell on the GPU. +""" +function BuildNeighborCellListsCUDA!(NeighborList::CUDANeighborList) + Threads = 256 + Blocks = cld(NeighborList.Grid.CellCount, Threads) + CUDA.@cuda threads=Threads blocks=Blocks BuildNeighborCellListsKernel!( + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + NeighborList.NeighborOffsets, + NeighborList.Grid, + ) + return nothing +end + +""" + UpdateNeighborsCUDA!(NeighborList, Position) + +Update cell ids, particle ranges, and particle order on the GPU. +""" +function UpdateNeighborsCUDA!(NeighborList::CUDANeighborList, Position) + Threads = 256 + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks ComputeCellIdsKernel!( + NeighborList.CellIds, + Position, + NeighborList.Grid, + ) + fill!(NeighborList.CellCounts, 0) + CUDA.@cuda threads=Threads blocks=Blocks ComputeCellCountsKernel!( + NeighborList.CellCounts, + NeighborList.CellIds, + ) + CellPrefix = CUDA.cumsum(NeighborList.CellCounts) + BlocksCells = cld(length(CellPrefix), Threads) + CUDA.@cuda threads=Threads blocks=BlocksCells BuildParticleRangesKernel!( + NeighborList.ParticleRanges, + CellPrefix, + ) + copyto!(NeighborList.CellOffsets, NeighborList.ParticleRanges[1:(end - 1)]) + CUDA.@cuda threads=Threads blocks=Blocks BuildParticleOrderKernel!( + NeighborList.ParticleOrder, + NeighborList.CellOffsets, + NeighborList.CellIds, + ) + return nothing +end + +function NeighborLoopKernel!(InteractionKernel, + CellIds, + ParticleOrder, + ParticleRanges, + NeighborCells, + NeighborCounts, + Args...) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = Int(CellIds[Index]) + StartIndex = Int(ParticleRanges[CellId]) + EndIndex = Int(ParticleRanges[CellId + 1]) - 1 + for j in StartIndex:EndIndex + NeighborIndex = Int(ParticleOrder[j]) + if NeighborIndex != Index + InteractionKernel(Index, NeighborIndex, Args...) + end + end + NeighborCount = Int(NeighborCounts[CellId]) + for n in 1:NeighborCount + NeighborCell = Int(NeighborCells[n, CellId]) + StartIndex = Int(ParticleRanges[NeighborCell]) + EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1 + for j in StartIndex:EndIndex + NeighborIndex = Int(ParticleOrder[j]) + InteractionKernel(Index, NeighborIndex, Args...) + end + end + end + return nothing +end + +function NeighborLoopPerParticleKernel!(InitKernel, + InteractionKernel, + FinalKernel, + CellIds, + ParticleOrder, + ParticleRanges, + NeighborCells, + NeighborCounts, + Args...) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(CellIds) + CellId = Int(CellIds[Index]) + Accumulator = InitKernel(Index, Args...) + StartIndex = Int(ParticleRanges[CellId]) + EndIndex = Int(ParticleRanges[CellId + 1]) - 1 + for j in StartIndex:EndIndex + NeighborIndex = Int(ParticleOrder[j]) + if NeighborIndex != Index + Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...) + end + end + NeighborCount = Int(NeighborCounts[CellId]) + for n in 1:NeighborCount + NeighborCell = Int(NeighborCells[n, CellId]) + StartIndex = Int(ParticleRanges[NeighborCell]) + EndIndex = Int(ParticleRanges[NeighborCell + 1]) - 1 + for j in StartIndex:EndIndex + NeighborIndex = Int(ParticleOrder[j]) + Accumulator = InteractionKernel(Index, NeighborIndex, Accumulator, Args...) + end + end + FinalKernel(Index, Accumulator, Args...) + end + return nothing +end + +""" + NeighborLoopCUDA!(InteractionKernel, NeighborList, Args...; Threads=256) + +Launch a CUDA neighbor loop that calls `InteractionKernel(i, j, Args...)`. +""" +function NeighborLoopCUDA!(InteractionKernel, + NeighborList::CUDANeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!( + InteractionKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + +function NeighborLoopCUDA!(InteractionKernel, + NeighborList::CUDAPackedNeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopKernel!( + InteractionKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + +""" + NeighborLoopPerParticleCUDA!(InitKernel, InteractionKernel, FinalKernel, NeighborList, Args...; Threads=256) + +Launch a CUDA neighbor loop that accumulates per-particle state. The kernels +are called as: + +- `InitKernel(i, Args...)` to initialize the per-particle accumulator. +- `InteractionKernel(i, j, Accumulator, Args...)` for each neighbor, returning an updated accumulator. +- `FinalKernel(i, Accumulator, Args...)` to store the final per-particle state. +""" +function NeighborLoopPerParticleCUDA!(InitKernel, + InteractionKernel, + FinalKernel, + NeighborList::CUDANeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!( + InitKernel, + InteractionKernel, + FinalKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + +function NeighborLoopPerParticleCUDA!(InitKernel, + InteractionKernel, + FinalKernel, + NeighborList::CUDAPackedNeighborList, + Args...; + Threads::Int = 256) + Blocks = cld(length(NeighborList.CellIds), Threads) + CUDA.@cuda threads=Threads blocks=Blocks NeighborLoopPerParticleKernel!( + InitKernel, + InteractionKernel, + FinalKernel, + NeighborList.CellIds, + NeighborList.ParticleOrder, + NeighborList.ParticleRanges, + NeighborList.NeighborCells, + NeighborList.NeighborCounts, + Args..., + ) + return nothing +end + +end diff --git a/src/SPHCUDASimulation.jl b/src/SPHCUDASimulation.jl new file mode 100644 index 00000000..1736e373 --- /dev/null +++ b/src/SPHCUDASimulation.jl @@ -0,0 +1,377 @@ +module SPHCUDASimulation + +export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA! + +using CUDA +using StaticArrays +using Parameters +using LinearAlgebra + +using ..SimulationConstantsConfiguration +using ..SimulationGeometry +using ..SimulationLoggerConfiguration +using ..SimulationMetaDataConfiguration +using ..ProduceHDFVTK +using ..SPHCUDANeighborList +using ..SPHDensityDiffusionModels +using ..SPHKernels +using ..SPHViscosityModels +using ..TimeStepping: next_output_time +using ..SimulationEquations: EquationOfStateGamma7 +using ..OpenExternalPrograms +using ..SPHNeighborList: BuildNeighborCellLists!, ConstructStencil, UpdateNeighbors! + +struct CUDASimParticles{D, T} + Position::CuArray{SVector{D, T}} + Velocity::CuArray{SVector{D, T}} + Acceleration::CuArray{SVector{D, T}} + Density::CuArray{T} + Pressure::CuArray{T} + MotionLimiter::CuArray{T} + GravityFactor::CuArray{T} +end + +@inline function AllocateCUDASimParticles(SimParticles) + return CUDASimParticles( + CuArray(SimParticles.Position), + CuArray(SimParticles.Velocity), + CuArray(SimParticles.Acceleration), + CuArray(SimParticles.Density), + CuArray(SimParticles.Pressure), + CuArray(SimParticles.MotionLimiter), + CuArray(SimParticles.GravityFactor), + ) +end + +@inline function SyncParticlesFromCUDA!(SimParticles, CUDAParticles::CUDASimParticles) + copyto!(SimParticles.Position, Array(CUDAParticles.Position)) + copyto!(SimParticles.Velocity, Array(CUDAParticles.Velocity)) + copyto!(SimParticles.Acceleration, Array(CUDAParticles.Acceleration)) + copyto!(SimParticles.Density, Array(CUDAParticles.Density)) + copyto!(SimParticles.Pressure, Array(CUDAParticles.Pressure)) + return nothing +end + +@inline function ComputeBounds(Position) + MinCorner = reduce((A, B) -> min.(A, B), Position) + MaxCorner = reduce((A, B) -> max.(A, B), Position) + return MinCorner, MaxCorner +end + +@inline function BuildGrid(MinCorner, MaxCorner, CellSize) + Extents = MaxCorner - MinCorner + Dims = SVector{length(MinCorner), Int}(ceil.(Int, Extents ./ CellSize) .+ 1) + return CUDACellGrid(MinCorner, CellSize, Dims) +end + +@inline function BuildCellIds!(CellIds, Cells, CellDict) + @inbounds for Index in eachindex(CellIds, Cells) + CellIds[Index] = CellDict[Cells[Index]] + end + return nothing +end + +@generated function GravityVector(::Val{D}, Value::T) where {D, T} + Entries = [:(Index == $D ? Value : zero(T)) for Index in 1:D] + return quote + return SVector{$D, T}($(Entries...)) + end +end + +function PressureKernel!(Pressure, Density, SimConstants) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Pressure) + ρ = Density[Index] + Pressure[Index] = EquationOfStateGamma7(ρ, SimConstants.c₀, SimConstants.ρ₀) + end + return nothing +end + +function LimitDensityKernel!(Density, ρ₀, MotionLimiter) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Density) + if Density[Index] < ρ₀ && MotionLimiter[Index] == zero(eltype(MotionLimiter)) + Density[Index] = ρ₀ + end + end + return nothing +end + +function DensityEpsiKernel!(Density, DensityRate, DensityHalf, Δt) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Density) + Epsi = -(DensityRate[Index] / DensityHalf[Index]) * Δt + Density[Index] *= (2 - Epsi) / (2 + Epsi) + end + return nothing +end + +function HalfTimeStepKernel!(Position, Velocity, Acceleration, Density, + GravityFactor, MotionLimiter, + PositionHalf, VelocityHalf, DensityHalf, DensityRate, + DtHalf, GravityValue, Dimensions) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Position) + Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index]) + Acceleration[Index] = Acc + Factor = MotionLimiter[Index] + PositionHalf[Index] = Position[Index] + Velocity[Index] * DtHalf * Factor + VelocityHalf[Index] = Velocity[Index] + Acc * DtHalf * Factor + DensityHalf[Index] = Density[Index] + DensityRate[Index] * DtHalf + end + return nothing +end + +function FullTimeStepKernel!(Position, Velocity, Acceleration, GravityFactor, MotionLimiter, Dt, GravityValue, Dimensions) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Position) + Acc = Acceleration[Index] + GravityVector(Dimensions, GravityValue * GravityFactor[Index]) + Acceleration[Index] = Acc + Factor = MotionLimiter[Index] + Velocity[Index] += Acc * Dt * Factor + Position[Index] += (((Velocity[Index] + (Velocity[Index] - Acc * Dt * Factor)) / 2) * Dt) * Factor + end + return nothing +end + +function TimeStepBuffersKernel!(MaxVisc, MinDtForce, Position, Velocity, Acceleration, SimKernel) + Index = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if Index <= length(Position) + R = Position[Index] + V = Velocity[Index] + A = Acceleration[Index] + RSq = sqrt(dot(R, R))^2 + CurrVisc = abs(SimKernel.h * dot(V, R) / (RSq + SimKernel.η²)) + AMag = norm(A) + CurrDtForce = AMag > 0 ? sqrt(SimKernel.h / AMag) : typemax(eltype(MinDtForce)) + MaxVisc[Index] = CurrVisc + MinDtForce[Index] = CurrDtForce + end + return nothing +end + +@inline function InitAccumulatorKernel(Index, Density, Position) + return (zero(eltype(Density)), zero(eltype(Position))) +end + +@inline function InteractionKernel(Index, NeighborIndex, Accumulator, + Position, Density, Pressure, Velocity, + MotionLimiter, SimKernel, SimConstants) + DensityRate, AccelerationValue = Accumulator + Δx = Position[Index] - Position[NeighborIndex] + Δx² = dot(Δx, Δx) + if Δx² <= SimKernel.H² + Distance = sqrt(abs(Δx²)) + Q = clamp(Distance * SimKernel.h⁻¹, zero(Distance), 2) + ∇W = ∇Wᵢⱼ(SimKernel, Q, Δx) + + ρᵢ = Density[Index] + ρⱼ = Density[NeighborIndex] + + Vᵢ = Velocity[Index] + Vⱼ = Velocity[NeighborIndex] + Vᵢⱼ = Vᵢ - Vⱼ + DensityTerm = dot(-Vᵢⱼ, ∇W) + DensityRateContribution = -ρᵢ * (SimConstants.m₀ / ρⱼ) * DensityTerm + + LinearρFactor = inv(SimConstants.Cb * SimConstants.γ) * SimConstants.ρ₀ + Pᵢⱼᴴ = SimConstants.ρ₀ * (-SimConstants.g) * -Δx[end] + ρᵢⱼᴴ = Pᵢⱼᴴ * LinearρFactor + InvD = one(eltype(ρᵢ)) / (Δx² + SimKernel.η²) + ρⱼᵢ = ρⱼ - ρᵢ + ψᵢⱼ = 2 * (ρⱼᵢ - ρᵢⱼᴴ) * (-Δx) * InvD + MotionFactor = MotionLimiter[Index] * MotionLimiter[NeighborIndex] + DiffusionContribution = SimConstants.δᵩ * SimKernel.h * SimConstants.c₀ * (SimConstants.m₀ / ρⱼ) * dot(ψᵢⱼ, ∇W) * MotionFactor + + DensityRate += DensityRateContribution + DiffusionContribution + + Pᵢ = Pressure[Index] + Pⱼ = Pressure[NeighborIndex] + Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ) + Fab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, Q, SimConstants.dx) + AccelerationContribution = -SimConstants.m₀ * (Pfac + Fab) * ∇W + + ViscTerm = zero(AccelerationContribution) + VDotX = dot(Vᵢⱼ, Δx) + if VDotX < 0 + ρ̄ = 0.5 * (ρᵢ + ρⱼ) + μᵢⱼ = SimKernel.h * VDotX / (Δx² + SimKernel.η²) + ViscTerm = -SimConstants.m₀ * (-SimConstants.α * SimConstants.c₀ * μᵢⱼ) / ρ̄ * ∇W + end + + AccelerationValue += AccelerationContribution + ViscTerm + end + + return DensityRate, AccelerationValue +end + +@inline function FinalKernel(Index, Accumulator, DensityRate, Acceleration) + DensityRateValue, AccelerationValue = Accumulator + DensityRate[Index] = DensityRateValue + Acceleration[Index] = AccelerationValue + return nothing +end + +function ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) + MaxViscHost = maximum(Array(MaxVisc)) + MinDtHost = minimum(Array(MinDtForce)) + DtLimit = SimKernel.h / (SimConstants.c₀ + MaxViscHost) + return SimConstants.CFL * min(MinDtHost, DtLimit) +end + +function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}}, + SimMetaData::SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, LMode}, + SimConstants::SimulationConstants, + SimKernel::SPHKernelInstance, + SimLogger::SimulationLogger, + SimParticles, + SimViscosity::ArtificialViscosity, + SimDensityDiffusion::LinearDensityDiffusion, + ) where {Dimensions, FloatType, LMode<:LogMode} + + NumberOfPoints = length(SimParticles) + CUDAParticles = AllocateCUDASimParticles(SimParticles) + + DensityRate = CUDA.zeros(FloatType, NumberOfPoints) + VelocityHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints) + PositionHalf = CUDA.zeros(SVector{Dimensions, FloatType}, NumberOfPoints) + DensityHalf = CUDA.zeros(FloatType, NumberOfPoints) + MaxVisc = CUDA.zeros(FloatType, NumberOfPoints) + MinDtForce = CUDA.zeros(FloatType, NumberOfPoints) + + ParticleRanges = zeros(Int, NumberOfPoints + 1 + 1) + UniqueCells = zeros(CartesianIndex{Dimensions}, NumberOfPoints) + CellDict = Dict{CartesianIndex{Dimensions}, Int}() + FullStencil = ConstructStencil(Val(Dimensions)) + NeighborCellLists = [Int[] for _ in 1:length(UniqueCells)] + ParticleOrder = zeros(Int, NumberOfPoints) + CellOffsets = zeros(Int, length(ParticleRanges)) + CellIdsHost = zeros(Int, NumberOfPoints) + PackedNeighborList = nothing + + output = SetupVTKOutput(SimMetaData, SimParticles, SimKernel, Dimensions) + + SimMetaData.OutputIterationCounter = 1 + output.enqueue_particles(SimMetaData.OutputIterationCounter) + + InitializeLog!(SimMetaData, SimLogger, SimConstants, SimKernel, SimViscosity, SimDensityDiffusion, SimGeometry, SimParticles) + + Threads = 256 + Blocks = cld(NumberOfPoints, Threads) + DimensionsVal = Val(Dimensions) + + CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) + CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!( + MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel, + ) + Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) + + while SimMetaData.TotalTime <= next_output_time(SimMetaData) + copyto!(SimParticles.Position, Array(CUDAParticles.Position)) + IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellDict, ParticleOrder, CellOffsets) + UniqueCellsView = view(UniqueCells, 1:IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellDict) + BuildCellIds!(CellIdsHost, SimParticles.Cells, CellDict) + PackedNeighborList = UpdateNeighborsCPUToCUDA!( + PackedNeighborList, + CellIdsHost, + ParticleOrder, + view(ParticleRanges, 1:(IndexCounter + 1)), + view(NeighborCellLists, 1:IndexCounter), + ) + + CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) + + NeighborLoopPerParticleCUDA!( + InitAccumulatorKernel, + InteractionKernel, + FinalKernel, + PackedNeighborList, + CUDAParticles.Position, + CUDAParticles.Density, + CUDAParticles.Pressure, + CUDAParticles.Velocity, + CUDAParticles.MotionLimiter, + SimKernel, + SimConstants, + DensityRate, + CUDAParticles.Acceleration, + ) + + DtHalf = Dt * 0.5 + CUDA.@cuda threads=Threads blocks=Blocks HalfTimeStepKernel!( + CUDAParticles.Position, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + CUDAParticles.Density, + CUDAParticles.GravityFactor, + CUDAParticles.MotionLimiter, + PositionHalf, + VelocityHalf, + DensityHalf, + DensityRate, + DtHalf, + SimConstants.g, + DimensionsVal, + ) + + CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(DensityHalf, SimConstants.ρ₀, CUDAParticles.MotionLimiter) + + CUDA.@cuda threads=Threads blocks=Blocks PressureKernel!(CUDAParticles.Pressure, DensityHalf, SimConstants) + + NeighborLoopPerParticleCUDA!( + InitAccumulatorKernel, + InteractionKernel, + FinalKernel, + PackedNeighborList, + PositionHalf, + DensityHalf, + CUDAParticles.Pressure, + VelocityHalf, + CUDAParticles.MotionLimiter, + SimKernel, + SimConstants, + DensityRate, + CUDAParticles.Acceleration, + ) + + CUDA.@cuda threads=Threads blocks=Blocks LimitDensityKernel!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter) + CUDA.@cuda threads=Threads blocks=Blocks DensityEpsiKernel!(CUDAParticles.Density, DensityRate, DensityHalf, Dt) + + CUDA.@cuda threads=Threads blocks=Blocks FullTimeStepKernel!( + CUDAParticles.Position, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + CUDAParticles.GravityFactor, + CUDAParticles.MotionLimiter, + Dt, + SimConstants.g, + DimensionsVal, + ) + + CUDA.@cuda threads=Threads blocks=Blocks TimeStepBuffersKernel!( + MaxVisc, MinDtForce, CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, SimKernel, + ) + Dt = ComputeNextTimeStep(MaxVisc, MinDtForce, SimConstants, SimKernel) + + SimMetaData.Iteration += 1 + SimMetaData.CurrentTimeStep = Dt + SimMetaData.TotalTime += Dt + + SyncParticlesFromCUDA!(SimParticles, CUDAParticles) + LogStep!(SimMetaData, SimLogger) + SimMetaData.OutputIterationCounter += 1 + output.enqueue_particles(SimMetaData.OutputIterationCounter) + + if SimMetaData.TotalTime > SimMetaData.SimulationTime + output.close_files() + FinalizeLog!(SimMetaData, SimLogger) + AutoOpenLogFile(SimLogger, SimMetaData) + break + end + end + + return nothing +end + +end diff --git a/src/SPHExample.jl b/src/SPHExample.jl index a56895ea..ae313057 100644 --- a/src/SPHExample.jl +++ b/src/SPHExample.jl @@ -14,6 +14,11 @@ module SPHExample include("OpenExternalPrograms.jl") include("SPHDensityDiffusionModels.jl") include("SPHNeighborList.jl") + const HasCUDA = Base.find_package("CUDA") !== nothing + if HasCUDA + include("SPHCUDANeighborList.jl") + include("SPHCUDASimulation.jl") + end include("SPHCellList.jl") #Must be last # Re-export desired functions from each submodule @@ -60,6 +65,16 @@ module SPHExample export ConstructStencil, ExtractCells!, UpdateNeighbors!, BuildNeighborCellLists!, ComputeCellParticleCounts, ComputeCellNeighborCounts + if HasCUDA + using .SPHCUDANeighborList + export CUDACellGrid, CUDANeighborList, AllocateCUDANeighborList, + ConstructNeighborOffsets, BuildNeighborCellListsCUDA!, + UpdateNeighborsCUDA!, NeighborLoopCUDA!, NeighborLoopPerParticleCUDA!, + CUDAPackedNeighborList, AllocateCUDAPackedNeighborList, UpdateNeighborsCPUToCUDA! + using .SPHCUDASimulation + export RunSimulationCUDA, AllocateCUDASimParticles, SyncParticlesFromCUDA! + end + using .SPHCellList export NeighborLoop!, ComputeInteractions!, RunSimulation