From 31fb92f15e34ab04ffcc1d98da5d33607f18a66e Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:12:12 +0100 Subject: [PATCH 01/15] Compute CUDA timestep at step end --- src/SPHCUDA.jl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 6b9b31bb..0331016b 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -296,9 +296,6 @@ end UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) dt = Δt(Position, Velocity, Acceleration, SimConstants, SimKernel) - max_visc = Vector{FloatType}(undef, length(Position)) - min_dt_force = Vector{FloatType}(undef, length(Position)) - dt₂ = dt * 0.5 while SimMetaData.TotalTime <= next_output_time(SimMetaData) @@ -346,7 +343,6 @@ end SimDensityDiffusion, SimViscosity, SimKernel, SimMetaData, SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, ∇◌rᵢ, - max_visc, min_dt_force, ParticleOrder = ParticleOrder, Position = Positionₙ⁺, Density = ρₙ⁺, @@ -361,7 +357,7 @@ end @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = FinalizeTimeStep(max_visc, min_dt_force, SimConstants, SimKernel) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = Δt(Positionₙ⁺, Velocityₙ⁺, Acceleration, SimConstants, SimKernel) end return nothing From d072b39a69771a81ef726aecb195be7df12fd53b Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:15:59 +0100 Subject: [PATCH 02/15] Update CUDA timestep and deps --- Project.toml | 2 ++ src/SPHCUDA.jl | 1 + 2 files changed, 3 insertions(+) diff --git a/Project.toml b/Project.toml index 3eae26b8..fd2b9e39 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "0.7.0" [deps] Bumper = "8ce10254-0962-460f-a3d8-1f77fea1446e" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" FastPow = "c0e83750-1142-43a8-81cf-6c956b72b4d1" Format = "1fa38f19-a742-5d3f-a2b9-30dd87b9d5f8" @@ -24,6 +25,7 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" [compat] Bumper = "^0.7.1" CSV = "^0.10.15" +CUDA = "^5.4.0" FastPow = "^0.1.0" Format = "^1.3.7" HDF5 = "^0.17.2" diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 0331016b..ceb05483 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -358,6 +358,7 @@ end @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = Δt(Positionₙ⁺, Velocityₙ⁺, Acceleration, SimConstants, SimKernel) + dt₂ = dt * 0.5 end return nothing From beb382014bb7b901be2c7c3a52d06cb75ebc3f16 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:22:12 +0100 Subject: [PATCH 03/15] =?UTF-8?q?Qualify=20Bumper=20macros=20in=20=CE=94t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/TimeStepping.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TimeStepping.jl b/src/TimeStepping.jl index 0c795a95..d95d5b47 100644 --- a/src/TimeStepping.jl +++ b/src/TimeStepping.jl @@ -70,9 +70,9 @@ function Δt(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) n_chunks = Threads.nthreads() chunk_size = cld(N, n_chunks) - @no_escape begin - v_buffer = @alloc(Float64, n_chunks) - d_buffer = @alloc(Float64, n_chunks) + Bumper.@no_escape begin + v_buffer = Bumper.@alloc(Float64, n_chunks) + d_buffer = Bumper.@alloc(Float64, n_chunks) @sync for i in 1:n_chunks Threads.@spawn begin From 4e0b1cf538ee9c295bfd56c3a3eeca8acc1cad0c Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:22:18 +0100 Subject: [PATCH 04/15] Remove timestep buffers --- src/SPHCUDA.jl | 10 ++-------- src/SPHCellList.jl | 25 ++++++------------------- src/TimeStepping.jl | 38 +------------------------------------- 3 files changed, 9 insertions(+), 64 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index ceb05483..2b16b829 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -6,6 +6,7 @@ using CUDA using LinearAlgebra using Parameters using StaticArrays +using TimerOutputs using ..SPHKernels using ..SPHViscosityModels @@ -207,8 +208,7 @@ function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, - ∇◌rᵢ, max_visc = nothing, - min_dt_force = nothing; + ∇◌rᵢ; Position = SimParticles.Position, Density = SimParticles.Density, Pressure = SimParticles.Pressure, @@ -257,12 +257,6 @@ function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV CUDA.copyto!(dρdtI, dρdtI_gpu) CUDA.copyto!(Acceleration, acceleration_gpu) - if max_visc !== nothing || min_dt_force !== nothing - @inbounds for i in eachindex(Position) - UpdateTimeStepBuffers!(max_visc, min_dt_force, i, Position[i], Velocity[i], Acceleration[i], SimKernel) - end - end - return nothing end diff --git a/src/SPHCellList.jl b/src/SPHCellList.jl index 7d149e2e..d3b721bb 100644 --- a/src/SPHCellList.jl +++ b/src/SPHCellList.jl @@ -39,8 +39,7 @@ using LinearAlgebra SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, - ∇◌rᵢ, max_visc = nothing, - min_dt_force = nothing; + ∇◌rᵢ; Position = SimParticles.Position, Density = SimParticles.Density, Pressure = SimParticles.Pressure, @@ -84,7 +83,6 @@ using LinearAlgebra dρdtI[i] = dρdt_acc Acceleration[i] = acc_acc - UpdateTimeStepBuffers!(max_visc, min_dt_force, i, Position[i], Velocity[i], acc_acc, SimKernel) end return nothing @@ -95,8 +93,7 @@ using LinearAlgebra SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, - ∇◌rᵢ, max_visc = nothing, - min_dt_force = nothing; + ∇◌rᵢ; Position = SimParticles.Position, Density = SimParticles.Density, Pressure = SimParticles.Pressure, @@ -149,7 +146,6 @@ using LinearAlgebra Acceleration[i] = acc_acc Kernel[i] = kernel_acc KernelGradient[i] = kernel_grad_acc - UpdateTimeStepBuffers!(max_visc, min_dt_force, i, Position[i], Velocity[i], acc_acc, SimKernel) end return nothing @@ -160,8 +156,7 @@ using LinearAlgebra SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, - ∇◌rᵢ, max_visc = nothing, - min_dt_force = nothing; + ∇◌rᵢ; Position = SimParticles.Position, Density = SimParticles.Density, Pressure = SimParticles.Pressure, @@ -213,7 +208,6 @@ using LinearAlgebra Acceleration[i] = acc_acc ∇Cᵢ[i] = shift_c_acc ∇◌rᵢ[i] = shift_r_acc - UpdateTimeStepBuffers!(max_visc, min_dt_force, i, Position[i], Velocity[i], acc_acc, SimKernel) end return nothing @@ -224,8 +218,7 @@ using LinearAlgebra SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, - ∇◌rᵢ, max_visc = nothing, - min_dt_force = nothing; + ∇◌rᵢ; Position = SimParticles.Position, Density = SimParticles.Density, Pressure = SimParticles.Pressure, @@ -283,8 +276,6 @@ using LinearAlgebra KernelGradient[i] = kernel_grad_acc ∇Cᵢ[i] = shift_c_acc ∇◌rᵢ[i] = shift_r_acc - UpdateTimeStepBuffers!(max_visc, min_dt_force, i, Position[i], - Velocity[i], acc_acc, SimKernel) end return nothing @@ -729,9 +720,6 @@ using LinearAlgebra dt = Δt(Position, Velocity, Acceleration, SimConstants, SimKernel) @no_escape begin - max_visc = @alloc(FloatType, length(Position)) - min_dt_force = @alloc(FloatType, length(Position)) - dt₂ = dt * 0.5 while SimMetaData.TotalTime <= next_output_time(SimMetaData) @@ -795,7 +783,6 @@ using LinearAlgebra SimDensityDiffusion, SimViscosity, SimKernel, SimMetaData, SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, ∇◌rᵢ, - max_visc, min_dt_force, ParticleOrder = ParticleOrder, Position = Positionₙ⁺, Density = ρₙ⁺, @@ -806,7 +793,6 @@ using LinearAlgebra SimDensityDiffusion, SimViscosity, SimKernel, SimMetaData, SimConstants, SimParticles, ParticleRanges, CellLookup, NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, ∇◌rᵢ, - max_visc, min_dt_force, ParticleOrder = ParticleOrder, Position = Positionₙ⁺, Density = ρₙ⁺, @@ -822,7 +808,8 @@ using LinearAlgebra @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = FinalizeTimeStep(max_visc, min_dt_force, SimConstants, SimKernel) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = Δt(Positionₙ⁺, Velocityₙ⁺, Acceleration, SimConstants, SimKernel) + dt₂ = dt * 0.5 end end diff --git a/src/TimeStepping.jl b/src/TimeStepping.jl index d95d5b47..2f525cc0 100644 --- a/src/TimeStepping.jl +++ b/src/TimeStepping.jl @@ -1,6 +1,6 @@ module TimeStepping -export Δt, FinalizeTimeStep, UpdateTimeStepBuffers!, next_output_time, ProgressMotion, HalfTimeStep, FullTimeStep +export Δt, next_output_time, ProgressMotion, HalfTimeStep, FullTimeStep using LinearAlgebra using Parameters @@ -10,42 +10,6 @@ using ..SimulationEquations using ..SimulationGeometry using ..SimulationMetaDataConfiguration -@inline function UpdateTimeStepBuffers!(::Nothing, ::Nothing, index, position, - velocity, acceleration, sim_kernel) - return nothing -end - -@inline function UpdateTimeStepBuffers!(max_visc, min_dt_force, index, position, - velocity, acceleration, sim_kernel) - h = sim_kernel.h - η² = sim_kernel.η² - r_sq = sqrt(dot(position, position))^2 - curr_visc = abs(h * dot(velocity, position) / (r_sq + η²)) - a_mag = norm(acceleration) - curr_dt_force = a_mag > 0 ? sqrt(h / a_mag) : typemax(eltype(min_dt_force)) - @inbounds begin - max_visc[index] = curr_visc - min_dt_force[index] = curr_dt_force - end - return nothing -end - -""" - FinalizeTimeStep(max_visc, min_dt_force, SimulationConstants, SPHKernel) - -Compute the CFL-limited time step from the per-particle buffers. -""" -function FinalizeTimeStep(max_visc, min_dt_force, SimulationConstants, SPHKernel) - @unpack c₀, CFL = SimulationConstants - @unpack h = SPHKernel - - global_visc = maximum(max_visc) - global_dt_force = minimum(min_dt_force) - - dt2 = h / (c₀ + global_visc) - return CFL * min(global_dt_force, dt2) -end - """ Δt(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) From 90b13e022769ab545f35753835a088176193568e Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:26:18 +0100 Subject: [PATCH 05/15] Move CUDA timestep to GPU --- src/SPHCUDA.jl | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 2b16b829..31630740 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,6 +27,25 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() +@inline function ΔtCUDA(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) + @unpack c₀, CFL = SimulationConstants + @unpack h, η² = SPHKernel + + map_fn = (r, v, a) -> begin + r_sq = sqrt(dot(r, r))^2 + curr_visc = abs(h * dot(v, r) / (r_sq + η²)) + a_mag = norm(a) + curr_dt_force = a_mag > 0 ? sqrt(h / a_mag) : typemax(eltype(a_mag)) + return (curr_visc, curr_dt_force) + end + + reduce_fn = (left, right) -> (max(left[1], right[1]), min(left[2], right[2])) + global_visc, global_dt_force = CUDA.mapreduce(map_fn, reduce_fn, Position, Velocity, Acceleration) + + dt2 = h / (c₀ + global_visc) + return CFL * min(global_dt_force, dt2) +end + @inline function BuildNeighborCellRanges(NeighborCellLists, cell_count) starts = zeros(Int, cell_count + 1) total = 0 @@ -288,7 +307,12 @@ end GhostNormals = hasproperty(SimParticles, :GhostNormals) ? SimParticles.GhostNormals : nothing UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - dt = Δt(Position, Velocity, Acceleration, SimConstants, SimKernel) + + position_step_gpu = CuArray(Position) + velocity_step_gpu = CuArray(Velocity) + acceleration_step_gpu = CuArray(Acceleration) + + dt = ΔtCUDA(position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) dt₂ = dt * 0.5 @@ -351,7 +375,10 @@ end @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = Δt(Positionₙ⁺, Velocityₙ⁺, Acceleration, SimConstants, SimKernel) + CUDA.copyto!(position_step_gpu, Positionₙ⁺) + CUDA.copyto!(velocity_step_gpu, Velocityₙ⁺) + CUDA.copyto!(acceleration_step_gpu, Acceleration) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA(position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) dt₂ = dt * 0.5 end From e90db4954bfa264d38a516df539698a745a57a27 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:32:20 +0100 Subject: [PATCH 06/15] Fix CUDA timestep reduction --- src/SPHCUDA.jl | 31 +++++++++++++++++++++++-------- src/TimeStepping.jl | 6 +++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 31630740..de824244 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,21 +27,36 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() -@inline function ΔtCUDA(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) - @unpack c₀, CFL = SimulationConstants - @unpack h, η² = SPHKernel - - map_fn = (r, v, a) -> begin +function ΔtCUDAKernel!(max_visc, min_dt_force, Position, Velocity, Acceleration, h, η²) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + r = Position[i] + v = Velocity[i] + a = Acceleration[i] r_sq = sqrt(dot(r, r))^2 curr_visc = abs(h * dot(v, r) / (r_sq + η²)) a_mag = norm(a) curr_dt_force = a_mag > 0 ? sqrt(h / a_mag) : typemax(eltype(a_mag)) - return (curr_visc, curr_dt_force) + CUDA.atomic_max!(max_visc, 1, curr_visc) + CUDA.atomic_min!(min_dt_force, 1, curr_dt_force) end + return nothing +end - reduce_fn = (left, right) -> (max(left[1], right[1]), min(left[2], right[2])) - global_visc, global_dt_force = CUDA.mapreduce(map_fn, reduce_fn, Position, Velocity, Acceleration) +@inline function ΔtCUDA(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) + @unpack c₀, CFL = SimulationConstants + @unpack h, η² = SPHKernel + + scalar_type = eltype(eltype(Position)) + max_visc = CUDA.zeros(scalar_type, 1) + min_dt_force = CUDA.fill(typemax(scalar_type), 1) + + threads = 256 + blocks = cld(length(Position), threads) + @cuda threads=threads blocks=blocks ΔtCUDAKernel!(max_visc, min_dt_force, Position, Velocity, Acceleration, h, η²) + global_visc = Array(max_visc)[1] + global_dt_force = Array(min_dt_force)[1] dt2 = h / (c₀ + global_visc) return CFL * min(global_dt_force, dt2) end diff --git a/src/TimeStepping.jl b/src/TimeStepping.jl index 2f525cc0..e3b22a48 100644 --- a/src/TimeStepping.jl +++ b/src/TimeStepping.jl @@ -34,9 +34,9 @@ function Δt(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) n_chunks = Threads.nthreads() chunk_size = cld(N, n_chunks) - Bumper.@no_escape begin - v_buffer = Bumper.@alloc(Float64, n_chunks) - d_buffer = Bumper.@alloc(Float64, n_chunks) + @no_escape begin + v_buffer = @alloc(Float64, n_chunks) + d_buffer = @alloc(Float64, n_chunks) @sync for i in 1:n_chunks Threads.@spawn begin From 8e722e479b94a17c934698780ee3a7b9b96443b3 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:40:11 +0100 Subject: [PATCH 07/15] Fix CUDA timestep kernel --- src/SPHCUDA.jl | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index de824244..1ed86587 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,7 +27,7 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() -function ΔtCUDAKernel!(max_visc, min_dt_force, Position, Velocity, Acceleration, h, η²) +function ΔtCUDAKernel!(visc_buffer, dt_buffer, Position, Velocity, Acceleration, h, η²) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x if i <= length(Position) r = Position[i] @@ -36,27 +36,23 @@ function ΔtCUDAKernel!(max_visc, min_dt_force, Position, Velocity, Acceleration r_sq = sqrt(dot(r, r))^2 curr_visc = abs(h * dot(v, r) / (r_sq + η²)) a_mag = norm(a) - curr_dt_force = a_mag > 0 ? sqrt(h / a_mag) : typemax(eltype(a_mag)) - CUDA.atomic_max!(max_visc, 1, curr_visc) - CUDA.atomic_min!(min_dt_force, 1, curr_dt_force) + curr_dt_force = a_mag > 0 ? sqrt(h / a_mag) : oftype(a_mag, Inf) + visc_buffer[i] = curr_visc + dt_buffer[i] = curr_dt_force end return nothing end -@inline function ΔtCUDA(Position, Velocity, Acceleration, SimulationConstants, SPHKernel) +@inline function ΔtCUDA!(visc_buffer, dt_buffer, Position, Velocity, Acceleration, SimulationConstants, SPHKernel) @unpack c₀, CFL = SimulationConstants @unpack h, η² = SPHKernel - scalar_type = eltype(eltype(Position)) - max_visc = CUDA.zeros(scalar_type, 1) - min_dt_force = CUDA.fill(typemax(scalar_type), 1) - threads = 256 blocks = cld(length(Position), threads) - @cuda threads=threads blocks=blocks ΔtCUDAKernel!(max_visc, min_dt_force, Position, Velocity, Acceleration, h, η²) + @cuda threads=threads blocks=blocks ΔtCUDAKernel!(visc_buffer, dt_buffer, Position, Velocity, Acceleration, h, η²) - global_visc = Array(max_visc)[1] - global_dt_force = Array(min_dt_force)[1] + global_visc = CUDA.reduce(max, visc_buffer) + global_dt_force = CUDA.reduce(min, dt_buffer) dt2 = h / (c₀ + global_visc) return CFL * min(global_dt_force, dt2) end @@ -327,7 +323,11 @@ end velocity_step_gpu = CuArray(Velocity) acceleration_step_gpu = CuArray(Acceleration) - dt = ΔtCUDA(position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) + scalar_type = eltype(eltype(Position)) + visc_buffer_gpu = CUDA.zeros(scalar_type, length(Position)) + dt_buffer_gpu = CUDA.zeros(scalar_type, length(Position)) + + dt = ΔtCUDA!(visc_buffer_gpu, dt_buffer_gpu, position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) dt₂ = dt * 0.5 @@ -393,7 +393,7 @@ end CUDA.copyto!(position_step_gpu, Positionₙ⁺) CUDA.copyto!(velocity_step_gpu, Velocityₙ⁺) CUDA.copyto!(acceleration_step_gpu, Acceleration) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA(position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA!(visc_buffer_gpu, dt_buffer_gpu, position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) dt₂ = dt * 0.5 end From 5709f49befd52ce785504ecafbe804a2f2a3feeb Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:48:52 +0100 Subject: [PATCH 08/15] Avoid CUDA copyto for timestep --- src/SPHCUDA.jl | 45 ++------------------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 1ed86587..e86002b6 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,36 +27,6 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() -function ΔtCUDAKernel!(visc_buffer, dt_buffer, Position, Velocity, Acceleration, h, η²) - i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - if i <= length(Position) - r = Position[i] - v = Velocity[i] - a = Acceleration[i] - r_sq = sqrt(dot(r, r))^2 - curr_visc = abs(h * dot(v, r) / (r_sq + η²)) - a_mag = norm(a) - curr_dt_force = a_mag > 0 ? sqrt(h / a_mag) : oftype(a_mag, Inf) - visc_buffer[i] = curr_visc - dt_buffer[i] = curr_dt_force - end - return nothing -end - -@inline function ΔtCUDA!(visc_buffer, dt_buffer, Position, Velocity, Acceleration, SimulationConstants, SPHKernel) - @unpack c₀, CFL = SimulationConstants - @unpack h, η² = SPHKernel - - threads = 256 - blocks = cld(length(Position), threads) - @cuda threads=threads blocks=blocks ΔtCUDAKernel!(visc_buffer, dt_buffer, Position, Velocity, Acceleration, h, η²) - - global_visc = CUDA.reduce(max, visc_buffer) - global_dt_force = CUDA.reduce(min, dt_buffer) - dt2 = h / (c₀ + global_visc) - return CFL * min(global_dt_force, dt2) -end - @inline function BuildNeighborCellRanges(NeighborCellLists, cell_count) starts = zeros(Int, cell_count + 1) total = 0 @@ -319,15 +289,7 @@ end UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - position_step_gpu = CuArray(Position) - velocity_step_gpu = CuArray(Velocity) - acceleration_step_gpu = CuArray(Acceleration) - - scalar_type = eltype(eltype(Position)) - visc_buffer_gpu = CUDA.zeros(scalar_type, length(Position)) - dt_buffer_gpu = CUDA.zeros(scalar_type, length(Position)) - - dt = ΔtCUDA!(visc_buffer_gpu, dt_buffer_gpu, position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) + dt = Δt(Position, Velocity, Acceleration, SimConstants, SimKernel) dt₂ = dt * 0.5 @@ -390,10 +352,7 @@ end @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) - CUDA.copyto!(position_step_gpu, Positionₙ⁺) - CUDA.copyto!(velocity_step_gpu, Velocityₙ⁺) - CUDA.copyto!(acceleration_step_gpu, Acceleration) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA!(visc_buffer_gpu, dt_buffer_gpu, position_step_gpu, velocity_step_gpu, acceleration_step_gpu, SimConstants, SimKernel) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = Δt(Positionₙ⁺, Velocityₙ⁺, Acceleration, SimConstants, SimKernel) dt₂ = dt * 0.5 end From e04a6c2bdbf41a49bf119988ed3d5f596276af01 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 00:05:08 +0100 Subject: [PATCH 09/15] Move CUDA loop to GPU kernels --- src/SPHCUDA.jl | 494 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 399 insertions(+), 95 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index e86002b6..b5d3e104 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,6 +27,147 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() +struct CUDAParticleBuffers{D, T} + Position::CuArray{SVector{D, T}, 1} + Density::CuArray{T, 1} + Pressure::CuArray{T, 1} + Velocity::CuArray{SVector{D, T}, 1} + Acceleration::CuArray{SVector{D, T}, 1} + MotionLimiter::CuArray{T, 1} + GravityFactor::CuArray{T, 1} +end + +struct CUDASupportBuffers{D, T} + DρdtI::CuArray{T, 1} + Velocityₙ⁺::CuArray{SVector{D, T}, 1} + Positionₙ⁺::CuArray{SVector{D, T}, 1} + ρₙ⁺::CuArray{T, 1} + ∇Cᵢ::CuArray{SVector{D, T}, 1} + ∇◌rᵢ::CuArray{T, 1} + ΔtViscous::CuArray{T, 1} + ΔtForce::CuArray{T, 1} + ΔxScratch::CuArray{T, 1} +end + +mutable struct CUDANeighborBuffers + CellListIndices::CuArray{Int, 1} + ParticleRanges::CuArray{Int, 1} + ParticleOrder::CuArray{Int, 1} + NeighborCellStarts::CuArray{Int, 1} + NeighborCellEntries::CuArray{Int, 1} +end + +struct CUDAMotionBuffers{D, T} + Velocity::CuArray{T, 1} + StartTime::CuArray{T, 1} + Duration::CuArray{T, 1} + Direction::CuArray{SVector{D, T}, 1} + Active::CuArray{Bool, 1} +end + +function BuildCUDAParticleBuffers(SimParticles) + return CUDAParticleBuffers( + CuArray(SimParticles.Position), + CuArray(SimParticles.Density), + CuArray(SimParticles.Pressure), + CuArray(SimParticles.Velocity), + CuArray(SimParticles.Acceleration), + CuArray(SimParticles.MotionLimiter), + CuArray(SimParticles.GravityFactor), + ) +end + +function BuildCUDASupportBuffers(dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ) + return CUDASupportBuffers( + CuArray(dρdtI), + CuArray(Velocityₙ⁺), + CuArray(Positionₙ⁺), + CuArray(ρₙ⁺), + CuArray(∇Cᵢ), + CuArray(∇◌rᵢ), + CuArray(similar(dρdtI)), + CuArray(similar(dρdtI)), + CuArray(similar(dρdtI)), + ) +end + +function BuildCUDAMotionBuffers(SimParticles, MotionDefinition, ::Val{Dimensions}, ::Type{FloatType}) where {Dimensions, FloatType} + NumberOfPoints = length(SimParticles.Position) + ZeroVector = zero(eltype(SimParticles.Position)) + MotionVelocity = zeros(FloatType, NumberOfPoints) + MotionStartTime = zeros(FloatType, NumberOfPoints) + MotionDuration = zeros(FloatType, NumberOfPoints) + MotionDirection = Vector{SVector{Dimensions, FloatType}}(undef, NumberOfPoints) + MotionActive = falses(NumberOfPoints) + + if MotionDefinition !== nothing + @inbounds for i in 1:NumberOfPoints + MotionDirection[i] = ZeroVector + if SimParticles.Type[i] == Moving + motion = MotionDefinition[SimParticles.GroupMarker[i]] + if motion !== nothing + MotionVelocity[i] = motion.Velocity + MotionStartTime[i] = motion.StartTime + MotionDuration[i] = motion.Duration + MotionDirection[i] = motion.Direction + MotionActive[i] = true + end + end + end + else + fill!(MotionDirection, ZeroVector) + end + + return CUDAMotionBuffers( + CuArray(MotionVelocity), + CuArray(MotionStartTime), + CuArray(MotionDuration), + CuArray(MotionDirection), + CuArray(MotionActive), + ) +end + +function BuildCUDANeighborBuffers(SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) + cell_list_indices = BuildCellListIndices(SimParticles.Cells, CellLookup) + cell_count = length(NeighborCellLists) + neighbor_cell_starts, neighbor_cell_entries = BuildNeighborCellRanges(NeighborCellLists, cell_count) + + return CUDANeighborBuffers( + CuArray(cell_list_indices), + CuArray(ParticleRanges), + CuArray(ParticleOrder), + CuArray(neighbor_cell_starts), + CuArray(neighbor_cell_entries), + ) +end + +function RefreshCUDANeighborBuffers!(NeighborBuffers::CUDANeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) + cell_list_indices = BuildCellListIndices(SimParticles.Cells, CellLookup) + cell_count = length(NeighborCellLists) + neighbor_cell_starts, neighbor_cell_entries = BuildNeighborCellRanges(NeighborCellLists, cell_count) + + NeighborBuffers.CellListIndices = CuArray(cell_list_indices) + NeighborBuffers.ParticleRanges = CuArray(ParticleRanges) + NeighborBuffers.ParticleOrder = CuArray(ParticleOrder) + NeighborBuffers.NeighborCellStarts = CuArray(neighbor_cell_starts) + NeighborBuffers.NeighborCellEntries = CuArray(neighbor_cell_entries) + return nothing +end + +function SyncParticlesToHost!(SimParticles, CUDABuffers::CUDAParticleBuffers) + CUDA.copyto!(SimParticles.Position, CUDABuffers.Position) + CUDA.copyto!(SimParticles.Density, CUDABuffers.Density) + CUDA.copyto!(SimParticles.Pressure, CUDABuffers.Pressure) + CUDA.copyto!(SimParticles.Velocity, CUDABuffers.Velocity) + CUDA.copyto!(SimParticles.Acceleration, CUDABuffers.Acceleration) + return nothing +end + +function SyncPositionsToHost!(SimParticles, CUDABuffers::CUDAParticleBuffers) + CUDA.copyto!(SimParticles.Position, CUDABuffers.Position) + return nothing +end + @inline function BuildNeighborCellRanges(NeighborCellLists, cell_count) starts = zeros(Int, cell_count + 1) total = 0 @@ -47,6 +188,209 @@ CUDAAvailable() = CUDA.functional() return starts, entries end +function PressureCUDAKernel!(Press, Density, SimConstants) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Press) + ρ = Density[i] + Press[i] = ((SimConstants.c₀^2 * SimConstants.ρ₀) / 7) * ((ρ / SimConstants.ρ₀)^7 - 1) + end + return nothing +end + +function PressureCUDA!(Press, Density, SimConstants) + threads = 256 + blocks = cld(length(Press), threads) + @cuda threads=threads blocks=blocks PressureCUDAKernel!(Press, Density, SimConstants) + return nothing +end + +function DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Density) + epsi = -(dρdtIₙ⁺[i] / ρₙ⁺[i]) * Δt + Density[i] = Density[i] * (2 - epsi) / (2 + epsi) + end + return nothing +end + +function DensityEpsiCUDA!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) + threads = 256 + blocks = cld(length(Density), threads) + @cuda threads=threads blocks=blocks DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) + return nothing +end + +function LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Density) + if (Density[i] < ρ₀) * !Bool(MotionLimiter[i]) + Density[i] = ρ₀ + end + end + return nothing +end + +function LimitDensityAtBoundaryCUDA!(Density, ρ₀, MotionLimiter) + threads = 256 + blocks = cld(length(Density), threads) + @cuda threads=threads blocks=blocks LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) + return nothing +end + +function HalfTimeStepCUDAKernel!(Position, Density, Velocity, Acceleration, GravityFactor, MotionLimiter, + Positionₙ⁺, Velocityₙ⁺, ρₙ⁺, dρdtI, dt₂, SimConstants) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + acc = Acceleration[i] + ConstructGravitySVector(Acceleration[i], SimConstants.g * GravityFactor[i]) + Acceleration[i] = acc + limiter = MotionLimiter[i] + Positionₙ⁺[i] = Position[i] + Velocity[i] * dt₂ * limiter + Velocityₙ⁺[i] = Velocity[i] + acc * dt₂ * limiter + ρₙ⁺[i] = Density[i] + dρdtI[i] * dt₂ + end + return nothing +end + +function HalfTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, CUDASupport::CUDASupportBuffers, dt₂, SimConstants) + threads = 256 + blocks = cld(length(CUDAParticles.Position), threads) + @cuda threads=threads blocks=blocks HalfTimeStepCUDAKernel!( + CUDAParticles.Position, + CUDAParticles.Density, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + CUDAParticles.GravityFactor, + CUDAParticles.MotionLimiter, + CUDASupport.Positionₙ⁺, + CUDASupport.Velocityₙ⁺, + CUDASupport.ρₙ⁺, + CUDASupport.DρdtI, + dt₂, + SimConstants, + ) + return nothing +end + +function FullTimeStepCUDAKernel!(Position, Velocity, Acceleration, GravityFactor, MotionLimiter, dt, SimConstants) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + acc = Acceleration[i] + ConstructGravitySVector(Acceleration[i], SimConstants.g * GravityFactor[i]) + Acceleration[i] = acc + limiter = MotionLimiter[i] + Velocity[i] = Velocity[i] + acc * dt * limiter + Position[i] = Position[i] + (((Velocity[i] + (Velocity[i] - acc * dt * limiter)) / 2) * dt) * limiter + end + return nothing +end + +function FullTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, dt, SimConstants) + threads = 256 + blocks = cld(length(CUDAParticles.Position), threads) + @cuda threads=threads blocks=blocks FullTimeStepCUDAKernel!( + CUDAParticles.Position, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + CUDAParticles.GravityFactor, + CUDAParticles.MotionLimiter, + dt, + SimConstants, + ) + return nothing +end + +function ProgressMotionCUDAKernel!(Position, Velocity, MotionVelocity, MotionStartTime, MotionDuration, + MotionDirection, MotionActive, TotalTime, dt₂) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + if MotionActive[i] + should_move = (MotionStartTime[i] <= TotalTime) && (TotalTime <= (MotionStartTime[i] + MotionDuration[i])) + if should_move + Velocity[i] = MotionVelocity[i] * MotionDirection[i] + else + Velocity[i] = zero(Position[i]) + end + Position[i] = Position[i] + Velocity[i] * dt₂ + end + end + return nothing +end + +function ProgressMotionCUDA!(CUDAParticles::CUDAParticleBuffers, MotionBuffers::CUDAMotionBuffers, TotalTime, dt₂) + threads = 256 + blocks = cld(length(CUDAParticles.Position), threads) + @cuda threads=threads blocks=blocks ProgressMotionCUDAKernel!( + CUDAParticles.Position, + CUDAParticles.Velocity, + MotionBuffers.Velocity, + MotionBuffers.StartTime, + MotionBuffers.Duration, + MotionBuffers.Direction, + MotionBuffers.Active, + TotalTime, + dt₂, + ) + return nothing +end + +function ΔtCUDAKernel!(ViscousBuffer, ForceBuffer, Position, Velocity, Acceleration, SimConstants, SimKernel) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + r = Position[i] + v = Velocity[i] + a = Acceleration[i] + + r_sq = dot(r, r) + ViscousBuffer[i] = abs(SimKernel.h * dot(v, r) / (r_sq + SimKernel.η²)) + + a_sq = dot(a, a) + if a_sq > 0 + a_mag = sqrt(a_sq) + ForceBuffer[i] = sqrt(SimKernel.h / a_mag) + else + ForceBuffer[i] = Inf + end + end + return nothing +end + +function ΔtCUDA(CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers, SimConstants, SimKernel) + threads = 256 + blocks = cld(length(CUDAParticles.Position), threads) + @cuda threads=threads blocks=blocks ΔtCUDAKernel!( + CUDASupport.ΔtViscous, + CUDASupport.ΔtForce, + CUDAParticles.Position, + CUDAParticles.Velocity, + CUDAParticles.Acceleration, + SimConstants, + SimKernel, + ) + max_visc = CUDA.reduce(max, CUDASupport.ΔtViscous) + min_force = CUDA.reduce(min, CUDASupport.ΔtForce) + return SimConstants.CFL * min(min_force, SimKernel.h / (SimConstants.c₀ + max_visc)) +end + +function MaxDisplacementCUDAKernel!(Scratch, Positionₙ⁺, Position) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + diff = Positionₙ⁺[i] - Position[i] + Scratch[i] = sqrt(dot(diff, diff)) + end + return nothing +end + +function UpdateΔxCUDA!(Δx, CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers) + threads = 256 + blocks = cld(length(CUDAParticles.Position), threads) + @cuda threads=threads blocks=blocks MaxDisplacementCUDAKernel!( + CUDASupport.ΔxScratch, + CUDASupport.Positionₙ⁺, + CUDAParticles.Position, + ) + maxd = CUDA.reduce(max, CUDASupport.ΔxScratch) + return Δx + 4 * maxd +end + @inline function BuildCellListIndices(Cells, CellLookup) indices = similar(Cells, Int) @inbounds for i in eachindex(Cells) @@ -204,19 +548,14 @@ function NeighborLoopCUDAKernel!(dρdtI, Acceleration, Position, Density, Pressu end function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV, SimKernel, - SimMetaData::SimulationMetaData{D,T,NoShifting,NoKernelOutput,B,L}, - SimConstants, SimParticles, ParticleRanges, - CellLookup, NeighborCellLists, dρdtI, - Acceleration, ∇Cᵢ, - ∇◌rᵢ; - Position = SimParticles.Position, - Density = SimParticles.Density, - Pressure = SimParticles.Pressure, - Velocity = SimParticles.Velocity, - ParticleOrder) where {D,T, - B<:MDBCMode,L<:LogMode, - SDD<:SPHDensityDiffusion, - SV<:SPHViscosity} + SimConstants, CUDAParticles::CUDAParticleBuffers, + NeighborBuffers::CUDANeighborBuffers, dρdtI, Acceleration; + Position = CUDAParticles.Position, + Density = CUDAParticles.Density, + Pressure = CUDAParticles.Pressure, + Velocity = CUDAParticles.Velocity) where { + SDD<:SPHDensityDiffusion, + SV<:SPHViscosity} if !CUDAAvailable() error("CUDA is not available; cannot run GPU neighbor loop.") end @@ -228,35 +567,16 @@ function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV error("CUDA neighbor loop supports ArtificialViscosity or ZeroViscosity.") end - cell_list_indices = BuildCellListIndices(SimParticles.Cells, CellLookup) - cell_count = length(NeighborCellLists) - neighbor_cell_starts, neighbor_cell_entries = BuildNeighborCellRanges(NeighborCellLists, cell_count) - - dρdtI_gpu = CuArray(dρdtI) - acceleration_gpu = CuArray(Acceleration) - position_gpu = CuArray(Position) - density_gpu = CuArray(Density) - pressure_gpu = CuArray(Pressure) - velocity_gpu = CuArray(Velocity) - motion_limiter_gpu = CuArray(SimParticles.MotionLimiter) - cell_list_indices_gpu = CuArray(cell_list_indices) - particle_ranges_gpu = CuArray(ParticleRanges) - particle_order_gpu = CuArray(ParticleOrder) - neighbor_cell_starts_gpu = CuArray(neighbor_cell_starts) - neighbor_cell_entries_gpu = CuArray(neighbor_cell_entries) - threads = 256 blocks = cld(length(Position), threads) @cuda threads=threads blocks=blocks NeighborLoopCUDAKernel!( - dρdtI_gpu, acceleration_gpu, position_gpu, density_gpu, pressure_gpu, - velocity_gpu, motion_limiter_gpu, cell_list_indices_gpu, particle_ranges_gpu, - particle_order_gpu, neighbor_cell_starts_gpu, neighbor_cell_entries_gpu, + dρdtI, Acceleration, Position, Density, Pressure, + Velocity, CUDAParticles.MotionLimiter, NeighborBuffers.CellListIndices, + NeighborBuffers.ParticleRanges, NeighborBuffers.ParticleOrder, + NeighborBuffers.NeighborCellStarts, NeighborBuffers.NeighborCellEntries, SimKernel, SimConstants, SimDensityDiffusion, SimViscosity, ) - CUDA.copyto!(dρdtI, dρdtI_gpu) - CUDA.copyto!(Acceleration, acceleration_gpu) - return nothing end @@ -265,97 +585,77 @@ end SimConstants, SimParticles, FullStencil, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets, - NeighborCellLists, dρdtI, Velocityₙ⁺, - Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ, - MotionDefinition::Union{ - Nothing, - AbstractVector{ - Union{ - Nothing, - MotionDetails{Dimensions, FloatType}, - }, - }, - }) where { - Dimensions, FloatType, SMode, KMode, - BMode, LMode, - SDD<:SPHDensityDiffusion, - SV<:SPHViscosity} - @unpack Position, Density, Pressure, Velocity, Acceleration, MotionLimiter, - GroupMarker = SimParticles - ParticleType = SimParticles.Type - ParticleMarker = GroupMarker - GhostPoints = hasproperty(SimParticles, :GhostPoints) ? SimParticles.GhostPoints : nothing - GhostNormals = hasproperty(SimParticles, :GhostNormals) ? SimParticles.GhostNormals : nothing - + NeighborCellLists, CUDAParticles::CUDAParticleBuffers, + CUDASupport::CUDASupportBuffers, MotionBuffers, + NeighborBuffers::CUDANeighborBuffers) where { + Dimensions, FloatType, SMode, KMode, + BMode, LMode, + SDD<:SPHDensityDiffusion, + SV<:SPHViscosity} UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - dt = Δt(Position, Velocity, Acceleration, SimConstants, SimKernel) - + dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel) dt₂ = dt * 0.5 while SimMetaData.TotalTime <= next_output_time(SimMetaData) @timeit SimMetaData.HourGlass "01 Calculate IndexCounter" begin - SimMetaData.Δx = UpdateΔx!(SimMetaData.Δx, Positionₙ⁺, SimParticles.Position) + SimMetaData.Δx = UpdateΔxCUDA!(SimMetaData.Δx, CUDASupport, CUDAParticles) ShouldRebuild = SimMetaData.Δx >= SimKernel.h if ShouldRebuild + SyncPositionsToHost!(SimParticles, CUDAParticles) @timeit SimMetaData.HourGlass "01a Actual Calculate IndexCounter" begin SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) end - SimMetaData.Δx = zero(eltype(dρdtI)) + SimMetaData.Δx = zero(eltype(CUDASupport.DρdtI)) UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellLookup) + RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) end end - @timeit SimMetaData.HourGlass "Motion" ProgressMotion(SimParticles, dt₂, MotionDefinition, SimMetaData) - - @timeit SimMetaData.HourGlass "02 Pressure" Pressure!(SimParticles.Pressure, SimParticles.Density, SimConstants) - if SimMetaData isa SimulationMetaData{Dimensions, FloatType, SMode, KMode, NoMDBC, LMode} where {SMode, KMode, LMode} - @timeit SimMetaData.HourGlass "03 Apply MDBC before Half TimeStep" SPHCellList.ApplyMDBCBeforeHalf!(SimMetaData) - else - @timeit SimMetaData.HourGlass "03 Apply MDBC before Half TimeStep" SPHCellList.ApplyMDBCBeforeHalf!( - SimMetaData, SimKernel, SimConstants, SimParticles, ParticleRanges, CellLookup, Position, - Density, GhostPoints, GhostNormals, ParticleType; ParticleOrder = ParticleOrder, - ) + if MotionBuffers !== nothing + @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂) end + @timeit SimMetaData.HourGlass "02 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) + @timeit SimMetaData.HourGlass "04 First NeighborLoop" NeighborLoopPerParticleCUDA!( - SimDensityDiffusion, SimViscosity, SimKernel, SimMetaData, - SimConstants, SimParticles, ParticleRanges, CellLookup, - NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, ∇◌rᵢ, - ParticleOrder = ParticleOrder, + SimDensityDiffusion, SimViscosity, SimKernel, SimConstants, + CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, ) - @timeit SimMetaData.HourGlass "05 Update To Half TimeStep" HalfTimeStep(SimMetaData, SimConstants, SimParticles, Positionₙ⁺, Velocityₙ⁺, ρₙ⁺, dρdtI, dt₂) + @timeit SimMetaData.HourGlass "05 Update To Half TimeStep" HalfTimeStepCUDA!(CUDAParticles, CUDASupport, dt₂, SimConstants) - @timeit SimMetaData.HourGlass "06 Half LimitDensityAtBoundary" LimitDensityAtBoundary!(ρₙ⁺, SimConstants.ρ₀, MotionLimiter) + @timeit SimMetaData.HourGlass "06 Half LimitDensityAtBoundary" LimitDensityAtBoundaryCUDA!(CUDASupport.ρₙ⁺, SimConstants.ρ₀, CUDAParticles.MotionLimiter) - @timeit SimMetaData.HourGlass "Motion" ProgressMotion(SimParticles, dt₂, MotionDefinition, SimMetaData) + if MotionBuffers !== nothing + @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂) + end - @timeit SimMetaData.HourGlass "07 Pressure" Pressure!(SimParticles.Pressure, ρₙ⁺, SimConstants) + @timeit SimMetaData.HourGlass "07 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDASupport.ρₙ⁺, SimConstants) @timeit SimMetaData.HourGlass "08 Second NeighborLoop" NeighborLoopPerParticleCUDA!( - SimDensityDiffusion, SimViscosity, SimKernel, SimMetaData, - SimConstants, SimParticles, ParticleRanges, CellLookup, - NeighborCellLists, dρdtI, Acceleration, ∇Cᵢ, ∇◌rᵢ, - ParticleOrder = ParticleOrder, - Position = Positionₙ⁺, - Density = ρₙ⁺, - Velocity = Velocityₙ⁺, + SimDensityDiffusion, SimViscosity, SimKernel, SimConstants, + CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, + Position = CUDASupport.Positionₙ⁺, + Density = CUDASupport.ρₙ⁺, + Velocity = CUDASupport.Velocityₙ⁺, ) - @timeit SimMetaData.HourGlass "09 Final LimitDensityAtBoundary" LimitDensityAtBoundary!(Density, SimConstants.ρ₀, MotionLimiter) + @timeit SimMetaData.HourGlass "09 Final LimitDensityAtBoundary" LimitDensityAtBoundaryCUDA!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter) - @timeit SimMetaData.HourGlass "10 Final Density" DensityEpsi!(Density, dρdtI, ρₙ⁺, dt) + @timeit SimMetaData.HourGlass "10 Final Density" DensityEpsiCUDA!(CUDAParticles.Density, CUDASupport.DρdtI, CUDASupport.ρₙ⁺, dt) - @timeit SimMetaData.HourGlass "11 Update To Final TimeStep" FullTimeStep(SimMetaData, SimKernel, SimConstants, SimParticles, ∇Cᵢ, ∇◌rᵢ, dt) + @timeit SimMetaData.HourGlass "11 Update To Final TimeStep" FullTimeStepCUDA!(CUDAParticles, dt, SimConstants) @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = Δt(Positionₙ⁺, Velocityₙ⁺, Acceleration, SimConstants, SimKernel) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel) dt₂ = dt * 0.5 end + SyncParticlesToHost!(SimParticles, CUDAParticles) + return nothing end @@ -372,8 +672,8 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} if !CUDAAvailable() error("CUDA is not available; cannot run RunSimulationCUDA.") end - if !(SimMetaData isa SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, BMode, LMode} where {BMode, LMode}) - error("RunSimulationCUDA currently supports NoShifting and NoKernelOutput configurations.") + if !(SimMetaData isa SimulationMetaData{Dimensions, FloatType, NoShifting, NoKernelOutput, NoMDBC, LMode} where {LMode}) + error("RunSimulationCUDA currently supports NoShifting, NoKernelOutput, and NoMDBC configurations.") end NumberOfPoints = length(SimParticles) @@ -415,14 +715,18 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} end MotionDefinition = SPHCellList.GenerateMotionDetails(SimParticles, SimGeometry, Dimensions, FloatType) + CUDAParticles = BuildCUDAParticleBuffers(SimParticles) + CUDASupport = BuildCUDASupportBuffers(dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ) + MotionBuffers = MotionDefinition === nothing ? nothing : BuildCUDAMotionBuffers(SimParticles, MotionDefinition, Val(Dimensions), FloatType) + NeighborBuffers = BuildCUDANeighborBuffers(SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) @inbounds while true @timeit SimMetaData.HourGlass "00 SimulationLoop" SimulationLoopCUDA( SimDensityDiffusion, SimViscosity, SimKernel, SimMetaData, SimConstants, SimParticles, FullStencil, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets, - NeighborCellLists, dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, - ∇Cᵢ, ∇◌rᵢ, MotionDefinition, + NeighborCellLists, CUDAParticles, CUDASupport, + MotionBuffers, NeighborBuffers, ) push!(SimMetaData.TimeSteps, SimMetaData.CurrentTimeStep) From ba8cbb83c472800ce5df35a33241c95e357554aa Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 00:12:38 +0100 Subject: [PATCH 10/15] Initialize CUDA neighbor buffers safely --- src/SPHCUDA.jl | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index b5d3e104..99321135 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -141,6 +141,17 @@ function BuildCUDANeighborBuffers(SimParticles, ParticleRanges, ParticleOrder, N ) end +function BuildEmptyCUDANeighborBuffers() + empty_ints = Int[] + return CUDANeighborBuffers( + CuArray(empty_ints), + CuArray(empty_ints), + CuArray(empty_ints), + CuArray(empty_ints), + CuArray(empty_ints), + ) +end + function RefreshCUDANeighborBuffers!(NeighborBuffers::CUDANeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) cell_list_indices = BuildCellListIndices(SimParticles.Cells, CellLookup) cell_count = length(NeighborCellLists) @@ -594,6 +605,14 @@ end SV<:SPHViscosity} UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) + if isempty(CellLookup.Grid) + SyncPositionsToHost!(SimParticles, CUDAParticles) + SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) + UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellLookup) + RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) + end + dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel) dt₂ = dt * 0.5 @@ -718,7 +737,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} CUDAParticles = BuildCUDAParticleBuffers(SimParticles) CUDASupport = BuildCUDASupportBuffers(dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ) MotionBuffers = MotionDefinition === nothing ? nothing : BuildCUDAMotionBuffers(SimParticles, MotionDefinition, Val(Dimensions), FloatType) - NeighborBuffers = BuildCUDANeighborBuffers(SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) + NeighborBuffers = BuildEmptyCUDANeighborBuffers() @inbounds while true @timeit SimMetaData.HourGlass "00 SimulationLoop" SimulationLoopCUDA( From 22ef0980ccde70517973f2d6fb59d95ea45799af Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 00:34:34 +0100 Subject: [PATCH 11/15] Defer CUDA neighbor rebuilds to host loop --- src/SPHCUDA.jl | 40 +++++++++++++++++----------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 99321135..bfbf9b88 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -605,33 +605,11 @@ end SV<:SPHViscosity} UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - if isempty(CellLookup.Grid) - SyncPositionsToHost!(SimParticles, CUDAParticles) - SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) - UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellLookup) - RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) - end - dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel) dt₂ = dt * 0.5 while SimMetaData.TotalTime <= next_output_time(SimMetaData) - @timeit SimMetaData.HourGlass "01 Calculate IndexCounter" begin - SimMetaData.Δx = UpdateΔxCUDA!(SimMetaData.Δx, CUDASupport, CUDAParticles) - ShouldRebuild = SimMetaData.Δx >= SimKernel.h - - if ShouldRebuild - SyncPositionsToHost!(SimParticles, CUDAParticles) - @timeit SimMetaData.HourGlass "01a Actual Calculate IndexCounter" begin - SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) - end - SimMetaData.Δx = zero(eltype(CUDASupport.DρdtI)) - UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - BuildNeighborCellLists!(NeighborCellLists, FullStencil, UniqueCellsView, ParticleRanges, CellLookup) - RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) - end - end + @timeit SimMetaData.HourGlass "01 Calculate IndexCounter" SimMetaData.Δx = UpdateΔxCUDA!(SimMetaData.Δx, CUDASupport, CUDAParticles) if MotionBuffers !== nothing @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂) @@ -738,6 +716,12 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} CUDASupport = BuildCUDASupportBuffers(dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ) MotionBuffers = MotionDefinition === nothing ? nothing : BuildCUDAMotionBuffers(SimParticles, MotionDefinition, Val(Dimensions), FloatType) NeighborBuffers = BuildEmptyCUDANeighborBuffers() + SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) + if SimMetaData.IndexCounter > 0 + unique_cells_view = view(UniqueCells, 1:SimMetaData.IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, unique_cells_view, ParticleRanges, CellLookup) + RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) + end @inbounds while true @timeit SimMetaData.HourGlass "00 SimulationLoop" SimulationLoopCUDA( @@ -749,6 +733,16 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} ) push!(SimMetaData.TimeSteps, SimMetaData.CurrentTimeStep) + if SimMetaData.Δx >= SimKernel.h + SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) + SimMetaData.Δx = zero(eltype(dρdtI)) + if SimMetaData.IndexCounter > 0 + unique_cells_view = view(UniqueCells, 1:SimMetaData.IndexCounter) + BuildNeighborCellLists!(NeighborCellLists, FullStencil, unique_cells_view, ParticleRanges, CellLookup) + RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) + end + end + LogStep!(SimMetaData, SimLogger) SimMetaData.OutputIterationCounter += 1 From 5a15fa1480fda7202a6d3fe8c8a8d299701a7563 Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 00:57:06 +0100 Subject: [PATCH 12/15] Scale CUDA launch size per particle --- src/SPHCUDA.jl | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index bfbf9b88..83826f56 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,6 +27,14 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() +@inline function KernelLaunchConfig(length) + if length == 0 + return 1, 1 + end + threads = min(256, length) + return threads, cld(length, threads) +end + struct CUDAParticleBuffers{D, T} Position::CuArray{SVector{D, T}, 1} Density::CuArray{T, 1} @@ -209,8 +217,7 @@ function PressureCUDAKernel!(Press, Density, SimConstants) end function PressureCUDA!(Press, Density, SimConstants) - threads = 256 - blocks = cld(length(Press), threads) + threads, blocks = KernelLaunchConfig(length(Press)) @cuda threads=threads blocks=blocks PressureCUDAKernel!(Press, Density, SimConstants) return nothing end @@ -225,8 +232,7 @@ function DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) end function DensityEpsiCUDA!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) - threads = 256 - blocks = cld(length(Density), threads) + threads, blocks = KernelLaunchConfig(length(Density)) @cuda threads=threads blocks=blocks DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) return nothing end @@ -242,8 +248,7 @@ function LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) end function LimitDensityAtBoundaryCUDA!(Density, ρ₀, MotionLimiter) - threads = 256 - blocks = cld(length(Density), threads) + threads, blocks = KernelLaunchConfig(length(Density)) @cuda threads=threads blocks=blocks LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) return nothing end @@ -263,8 +268,7 @@ function HalfTimeStepCUDAKernel!(Position, Density, Velocity, Acceleration, Grav end function HalfTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, CUDASupport::CUDASupportBuffers, dt₂, SimConstants) - threads = 256 - blocks = cld(length(CUDAParticles.Position), threads) + threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) @cuda threads=threads blocks=blocks HalfTimeStepCUDAKernel!( CUDAParticles.Position, CUDAParticles.Density, @@ -295,8 +299,7 @@ function FullTimeStepCUDAKernel!(Position, Velocity, Acceleration, GravityFactor end function FullTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, dt, SimConstants) - threads = 256 - blocks = cld(length(CUDAParticles.Position), threads) + threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) @cuda threads=threads blocks=blocks FullTimeStepCUDAKernel!( CUDAParticles.Position, CUDAParticles.Velocity, @@ -327,8 +330,7 @@ function ProgressMotionCUDAKernel!(Position, Velocity, MotionVelocity, MotionSta end function ProgressMotionCUDA!(CUDAParticles::CUDAParticleBuffers, MotionBuffers::CUDAMotionBuffers, TotalTime, dt₂) - threads = 256 - blocks = cld(length(CUDAParticles.Position), threads) + threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) @cuda threads=threads blocks=blocks ProgressMotionCUDAKernel!( CUDAParticles.Position, CUDAParticles.Velocity, @@ -365,8 +367,7 @@ function ΔtCUDAKernel!(ViscousBuffer, ForceBuffer, Position, Velocity, Accelera end function ΔtCUDA(CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers, SimConstants, SimKernel) - threads = 256 - blocks = cld(length(CUDAParticles.Position), threads) + threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) @cuda threads=threads blocks=blocks ΔtCUDAKernel!( CUDASupport.ΔtViscous, CUDASupport.ΔtForce, @@ -391,8 +392,7 @@ function MaxDisplacementCUDAKernel!(Scratch, Positionₙ⁺, Position) end function UpdateΔxCUDA!(Δx, CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers) - threads = 256 - blocks = cld(length(CUDAParticles.Position), threads) + threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) @cuda threads=threads blocks=blocks MaxDisplacementCUDAKernel!( CUDASupport.ΔxScratch, CUDASupport.Positionₙ⁺, @@ -578,8 +578,7 @@ function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV error("CUDA neighbor loop supports ArtificialViscosity or ZeroViscosity.") end - threads = 256 - blocks = cld(length(Position), threads) + threads, blocks = KernelLaunchConfig(length(Position)) @cuda threads=threads blocks=blocks NeighborLoopCUDAKernel!( dρdtI, Acceleration, Position, Density, Pressure, Velocity, CUDAParticles.MotionLimiter, NeighborBuffers.CellListIndices, From 9b8eff808de88bce198279640c3e3826d3c57a3d Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 19:07:50 +0100 Subject: [PATCH 13/15] Cache CUDA launch config per step --- src/SPHCUDA.jl | 83 ++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index 83826f56..eadd5e79 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -27,12 +27,17 @@ using StructArrays: StructArray CUDAAvailable() = CUDA.functional() +struct LaunchConfig + Threads::Int + Blocks::Int +end + @inline function KernelLaunchConfig(length) if length == 0 - return 1, 1 + return LaunchConfig(1, 1) end threads = min(256, length) - return threads, cld(length, threads) + return LaunchConfig(threads, cld(length, threads)) end struct CUDAParticleBuffers{D, T} @@ -216,9 +221,8 @@ function PressureCUDAKernel!(Press, Density, SimConstants) return nothing end -function PressureCUDA!(Press, Density, SimConstants) - threads, blocks = KernelLaunchConfig(length(Press)) - @cuda threads=threads blocks=blocks PressureCUDAKernel!(Press, Density, SimConstants) +function PressureCUDA!(Press, Density, SimConstants, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks PressureCUDAKernel!(Press, Density, SimConstants) return nothing end @@ -231,9 +235,8 @@ function DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) return nothing end -function DensityEpsiCUDA!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) - threads, blocks = KernelLaunchConfig(length(Density)) - @cuda threads=threads blocks=blocks DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) +function DensityEpsiCUDA!(Density, dρdtIₙ⁺, ρₙ⁺, Δt, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks DensityEpsiCUDAKernel!(Density, dρdtIₙ⁺, ρₙ⁺, Δt) return nothing end @@ -247,9 +250,8 @@ function LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) return nothing end -function LimitDensityAtBoundaryCUDA!(Density, ρ₀, MotionLimiter) - threads, blocks = KernelLaunchConfig(length(Density)) - @cuda threads=threads blocks=blocks LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) +function LimitDensityAtBoundaryCUDA!(Density, ρ₀, MotionLimiter, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks LimitDensityAtBoundaryCUDAKernel!(Density, ρ₀, MotionLimiter) return nothing end @@ -267,9 +269,8 @@ function HalfTimeStepCUDAKernel!(Position, Density, Velocity, Acceleration, Grav return nothing end -function HalfTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, CUDASupport::CUDASupportBuffers, dt₂, SimConstants) - threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) - @cuda threads=threads blocks=blocks HalfTimeStepCUDAKernel!( +function HalfTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, CUDASupport::CUDASupportBuffers, dt₂, SimConstants, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks HalfTimeStepCUDAKernel!( CUDAParticles.Position, CUDAParticles.Density, CUDAParticles.Velocity, @@ -298,9 +299,8 @@ function FullTimeStepCUDAKernel!(Position, Velocity, Acceleration, GravityFactor return nothing end -function FullTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, dt, SimConstants) - threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) - @cuda threads=threads blocks=blocks FullTimeStepCUDAKernel!( +function FullTimeStepCUDA!(CUDAParticles::CUDAParticleBuffers, dt, SimConstants, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks FullTimeStepCUDAKernel!( CUDAParticles.Position, CUDAParticles.Velocity, CUDAParticles.Acceleration, @@ -329,9 +329,8 @@ function ProgressMotionCUDAKernel!(Position, Velocity, MotionVelocity, MotionSta return nothing end -function ProgressMotionCUDA!(CUDAParticles::CUDAParticleBuffers, MotionBuffers::CUDAMotionBuffers, TotalTime, dt₂) - threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) - @cuda threads=threads blocks=blocks ProgressMotionCUDAKernel!( +function ProgressMotionCUDA!(CUDAParticles::CUDAParticleBuffers, MotionBuffers::CUDAMotionBuffers, TotalTime, dt₂, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks ProgressMotionCUDAKernel!( CUDAParticles.Position, CUDAParticles.Velocity, MotionBuffers.Velocity, @@ -366,9 +365,8 @@ function ΔtCUDAKernel!(ViscousBuffer, ForceBuffer, Position, Velocity, Accelera return nothing end -function ΔtCUDA(CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers, SimConstants, SimKernel) - threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) - @cuda threads=threads blocks=blocks ΔtCUDAKernel!( +function ΔtCUDA(CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers, SimConstants, SimKernel, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks ΔtCUDAKernel!( CUDASupport.ΔtViscous, CUDASupport.ΔtForce, CUDAParticles.Position, @@ -391,9 +389,8 @@ function MaxDisplacementCUDAKernel!(Scratch, Positionₙ⁺, Position) return nothing end -function UpdateΔxCUDA!(Δx, CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers) - threads, blocks = KernelLaunchConfig(length(CUDAParticles.Position)) - @cuda threads=threads blocks=blocks MaxDisplacementCUDAKernel!( +function UpdateΔxCUDA!(Δx, CUDASupport::CUDASupportBuffers, CUDAParticles::CUDAParticleBuffers, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks MaxDisplacementCUDAKernel!( CUDASupport.ΔxScratch, CUDASupport.Positionₙ⁺, CUDAParticles.Position, @@ -560,7 +557,7 @@ end function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV, SimKernel, SimConstants, CUDAParticles::CUDAParticleBuffers, - NeighborBuffers::CUDANeighborBuffers, dρdtI, Acceleration; + NeighborBuffers::CUDANeighborBuffers, dρdtI, Acceleration, Launch; Position = CUDAParticles.Position, Density = CUDAParticles.Density, Pressure = CUDAParticles.Pressure, @@ -578,8 +575,7 @@ function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV error("CUDA neighbor loop supports ArtificialViscosity or ZeroViscosity.") end - threads, blocks = KernelLaunchConfig(length(Position)) - @cuda threads=threads blocks=blocks NeighborLoopCUDAKernel!( + @cuda threads=Launch.Threads blocks=Launch.Blocks NeighborLoopCUDAKernel!( dρdtI, Acceleration, Position, Density, Pressure, Velocity, CUDAParticles.MotionLimiter, NeighborBuffers.CellListIndices, NeighborBuffers.ParticleRanges, NeighborBuffers.ParticleOrder, @@ -604,49 +600,50 @@ end SV<:SPHViscosity} UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel) + Launch = KernelLaunchConfig(length(CUDAParticles.Position)) + dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel, Launch) dt₂ = dt * 0.5 while SimMetaData.TotalTime <= next_output_time(SimMetaData) - @timeit SimMetaData.HourGlass "01 Calculate IndexCounter" SimMetaData.Δx = UpdateΔxCUDA!(SimMetaData.Δx, CUDASupport, CUDAParticles) + @timeit SimMetaData.HourGlass "01 Calculate IndexCounter" SimMetaData.Δx = UpdateΔxCUDA!(SimMetaData.Δx, CUDASupport, CUDAParticles, Launch) if MotionBuffers !== nothing - @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂) + @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂, Launch) end - @timeit SimMetaData.HourGlass "02 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants) + @timeit SimMetaData.HourGlass "02 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDAParticles.Density, SimConstants, Launch) @timeit SimMetaData.HourGlass "04 First NeighborLoop" NeighborLoopPerParticleCUDA!( SimDensityDiffusion, SimViscosity, SimKernel, SimConstants, - CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, + CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, Launch, ) - @timeit SimMetaData.HourGlass "05 Update To Half TimeStep" HalfTimeStepCUDA!(CUDAParticles, CUDASupport, dt₂, SimConstants) + @timeit SimMetaData.HourGlass "05 Update To Half TimeStep" HalfTimeStepCUDA!(CUDAParticles, CUDASupport, dt₂, SimConstants, Launch) - @timeit SimMetaData.HourGlass "06 Half LimitDensityAtBoundary" LimitDensityAtBoundaryCUDA!(CUDASupport.ρₙ⁺, SimConstants.ρ₀, CUDAParticles.MotionLimiter) + @timeit SimMetaData.HourGlass "06 Half LimitDensityAtBoundary" LimitDensityAtBoundaryCUDA!(CUDASupport.ρₙ⁺, SimConstants.ρ₀, CUDAParticles.MotionLimiter, Launch) if MotionBuffers !== nothing - @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂) + @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂, Launch) end - @timeit SimMetaData.HourGlass "07 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDASupport.ρₙ⁺, SimConstants) + @timeit SimMetaData.HourGlass "07 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDASupport.ρₙ⁺, SimConstants, Launch) @timeit SimMetaData.HourGlass "08 Second NeighborLoop" NeighborLoopPerParticleCUDA!( SimDensityDiffusion, SimViscosity, SimKernel, SimConstants, - CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, + CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, Launch, Position = CUDASupport.Positionₙ⁺, Density = CUDASupport.ρₙ⁺, Velocity = CUDASupport.Velocityₙ⁺, ) - @timeit SimMetaData.HourGlass "09 Final LimitDensityAtBoundary" LimitDensityAtBoundaryCUDA!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter) + @timeit SimMetaData.HourGlass "09 Final LimitDensityAtBoundary" LimitDensityAtBoundaryCUDA!(CUDAParticles.Density, SimConstants.ρ₀, CUDAParticles.MotionLimiter, Launch) - @timeit SimMetaData.HourGlass "10 Final Density" DensityEpsiCUDA!(CUDAParticles.Density, CUDASupport.DρdtI, CUDASupport.ρₙ⁺, dt) + @timeit SimMetaData.HourGlass "10 Final Density" DensityEpsiCUDA!(CUDAParticles.Density, CUDASupport.DρdtI, CUDASupport.ρₙ⁺, dt, Launch) - @timeit SimMetaData.HourGlass "11 Update To Final TimeStep" FullTimeStepCUDA!(CUDAParticles, dt, SimConstants) + @timeit SimMetaData.HourGlass "11 Update To Final TimeStep" FullTimeStepCUDA!(CUDAParticles, dt, SimConstants, Launch) @timeit SimMetaData.HourGlass "12 Update MetaData" UpdateMetaData!(SimMetaData, dt) - @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel) + @timeit SimMetaData.HourGlass "13 Update TimeStep" dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel, Launch) dt₂ = dt * 0.5 end From 072db87e2737a1f622c2475b261c76b40c278b0c Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 19:42:13 +0100 Subject: [PATCH 14/15] Add GPU neighbor grid rebuild --- src/SPHCUDA.jl | 321 ++++++++++++++++++++++++++----------------------- 1 file changed, 169 insertions(+), 152 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index eadd5e79..de48ae5b 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -62,12 +62,15 @@ struct CUDASupportBuffers{D, T} ΔxScratch::CuArray{T, 1} end -mutable struct CUDANeighborBuffers - CellListIndices::CuArray{Int, 1} - ParticleRanges::CuArray{Int, 1} - ParticleOrder::CuArray{Int, 1} - NeighborCellStarts::CuArray{Int, 1} - NeighborCellEntries::CuArray{Int, 1} +mutable struct CUDAGridBuffers{D} + CellCoords::CuArray{SVector{D, Int}, 1} + BucketCounts::CuArray{Int, 1} + BucketEnds::CuArray{Int, 1} + BucketStarts::CuArray{Int, 1} + BucketWrite::CuArray{Int, 1} + BucketParticles::CuArray{Int, 1} + NeighborOffsets::CuArray{SVector{D, Int}, 1} + BucketCount::Int end struct CUDAMotionBuffers{D, T} @@ -140,44 +143,23 @@ function BuildCUDAMotionBuffers(SimParticles, MotionDefinition, ::Val{Dimensions ) end -function BuildCUDANeighborBuffers(SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) - cell_list_indices = BuildCellListIndices(SimParticles.Cells, CellLookup) - cell_count = length(NeighborCellLists) - neighbor_cell_starts, neighbor_cell_entries = BuildNeighborCellRanges(NeighborCellLists, cell_count) - - return CUDANeighborBuffers( - CuArray(cell_list_indices), - CuArray(ParticleRanges), - CuArray(ParticleOrder), - CuArray(neighbor_cell_starts), - CuArray(neighbor_cell_entries), - ) -end - -function BuildEmptyCUDANeighborBuffers() - empty_ints = Int[] - return CUDANeighborBuffers( - CuArray(empty_ints), - CuArray(empty_ints), - CuArray(empty_ints), - CuArray(empty_ints), - CuArray(empty_ints), +function BuildCUDAGridBuffers(SimParticles, FullStencil) + NumberOfPoints = length(SimParticles.Position) + Dimensions = length(first(SimParticles.Position)) + BucketCount = max(1, 2 * NumberOfPoints) + NeighborOffsets = [SVector{Dimensions, Int}(Tuple(offset)) for offset in FullStencil] + return CUDAGridBuffers( + CuArray(similar(SimParticles.Position, SVector{Dimensions, Int})), + CuArray(zeros(Int, BucketCount)), + CuArray(zeros(Int, BucketCount)), + CuArray(zeros(Int, BucketCount)), + CuArray(zeros(Int, BucketCount)), + CuArray(zeros(Int, NumberOfPoints)), + CuArray(NeighborOffsets), + BucketCount, ) end -function RefreshCUDANeighborBuffers!(NeighborBuffers::CUDANeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) - cell_list_indices = BuildCellListIndices(SimParticles.Cells, CellLookup) - cell_count = length(NeighborCellLists) - neighbor_cell_starts, neighbor_cell_entries = BuildNeighborCellRanges(NeighborCellLists, cell_count) - - NeighborBuffers.CellListIndices = CuArray(cell_list_indices) - NeighborBuffers.ParticleRanges = CuArray(ParticleRanges) - NeighborBuffers.ParticleOrder = CuArray(ParticleOrder) - NeighborBuffers.NeighborCellStarts = CuArray(neighbor_cell_starts) - NeighborBuffers.NeighborCellEntries = CuArray(neighbor_cell_entries) - return nothing -end - function SyncParticlesToHost!(SimParticles, CUDABuffers::CUDAParticleBuffers) CUDA.copyto!(SimParticles.Position, CUDABuffers.Position) CUDA.copyto!(SimParticles.Density, CUDABuffers.Density) @@ -399,12 +381,89 @@ function UpdateΔxCUDA!(Δx, CUDASupport::CUDASupportBuffers, CUDAParticles::CUD return Δx + 4 * maxd end -@inline function BuildCellListIndices(Cells, CellLookup) - indices = similar(Cells, Int) - @inbounds for i in eachindex(Cells) - indices[i] = CellLookupIndex(CellLookup, Cells[i], 1) +@inline function MapFloorGPU(X, InverseCutOff) + return Int(sign(X)) * unsafe_trunc(Int, muladd(abs(X), InverseCutOff, 0.5)) +end + +@inline function HashCell(Cell) + if length(Cell) == 2 + return Cell[1] * 73856093 ⊻ Cell[2] * 19349663 + end + return Cell[1] * 73856093 ⊻ Cell[2] * 19349663 ⊻ Cell[3] * 83492791 +end + +function ResetBucketsKernel!(BucketCounts, BucketEnds, BucketStarts, BucketWrite) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(BucketCounts) + BucketCounts[i] = 0 + BucketEnds[i] = 0 + BucketStarts[i] = 0 + BucketWrite[i] = 0 end - return indices + return nothing +end + +function BuildCellCoordsKernel!(CellCoords, BucketCounts, Position, InverseCutOff, BucketCount) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(Position) + pos = Position[i] + CellCoords[i] = SVector{length(pos), Int}(ntuple(d -> MapFloorGPU(pos[d], InverseCutOff), length(pos))) + hash_val = HashCell(CellCoords[i]) + bucket = mod(hash_val, BucketCount) + 1 + CUDA.atomic_add!(BucketCounts, bucket, 1) + end + return nothing +end + +function InitializeBucketStartsKernel!(BucketCounts, BucketEnds, BucketStarts, BucketWrite) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(BucketCounts) + count = BucketCounts[i] + BucketStarts[i] = BucketEnds[i] - count + 1 + BucketWrite[i] = BucketStarts[i] + end + return nothing +end + +function FillBucketsKernel!(BucketWrite, BucketParticles, CellCoords, BucketCount) + i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if i <= length(CellCoords) + hash_val = HashCell(CellCoords[i]) + bucket = mod(hash_val, BucketCount) + 1 + index = CUDA.atomic_add!(BucketWrite, bucket, 1) + BucketParticles[index] = i + end + return nothing +end + +function RebuildNeighborGridCUDA!(CUDAGrid::CUDAGridBuffers, CUDAParticles::CUDAParticleBuffers, SimKernel, Launch) + @cuda threads=Launch.Threads blocks=Launch.Blocks ResetBucketsKernel!( + CUDAGrid.BucketCounts, + CUDAGrid.BucketEnds, + CUDAGrid.BucketStarts, + CUDAGrid.BucketWrite, + ) + @cuda threads=Launch.Threads blocks=Launch.Blocks BuildCellCoordsKernel!( + CUDAGrid.CellCoords, + CUDAGrid.BucketCounts, + CUDAParticles.Position, + SimKernel.H⁻¹, + CUDAGrid.BucketCount, + ) + CUDAGrid.BucketEnds .= CUDA.cumsum(CUDAGrid.BucketCounts) + @cuda threads=Launch.Threads blocks=Launch.Blocks InitializeBucketStartsKernel!( + CUDAGrid.BucketCounts, + CUDAGrid.BucketEnds, + CUDAGrid.BucketStarts, + CUDAGrid.BucketWrite, + ) + @cuda threads=Launch.Threads blocks=Launch.Blocks FillBucketsKernel!( + CUDAGrid.BucketWrite, + CUDAGrid.BucketParticles, + CUDAGrid.CellCoords, + CUDAGrid.BucketCount, + ) + return nothing end @inline function ComputeDensityDiffusionGPU(::ZeroDensityDiffusion, _SimKernel, _SimConstants, @@ -459,92 +518,56 @@ end end function NeighborLoopCUDAKernel!(dρdtI, Acceleration, Position, Density, Pressure, Velocity, - MotionLimiter, CellListIndices, ParticleRanges, ParticleOrder, - NeighborCellStarts, NeighborCellEntries, SimKernel, SimConstants, + MotionLimiter, CellCoords, BucketStarts, BucketEnds, + BucketParticles, NeighborOffsets, BucketCount, SimKernel, SimConstants, SimDensityDiffusion, SimViscosity) i = (blockIdx().x - 1) * blockDim().x + threadIdx().x if i <= length(Position) dρdt_acc = zero(eltype(dρdtI)) acc_acc = zero(Position[i]) - cell_list_index = CellListIndices[i] - same_cell_start = ParticleRanges[cell_list_index] - same_cell_end = ParticleRanges[cell_list_index + 1] - 1 - - @inbounds for j in same_cell_start:same_cell_end - j_index = ParticleOrder[j] - if j_index != i - xᵢⱼ = Position[i] - Position[j_index] - xᵢⱼ² = dot(xᵢⱼ, xᵢⱼ) - if xᵢⱼ² <= SimKernel.H² - dᵢⱼ = sqrt(abs(xᵢⱼ²)) - q = clamp(dᵢⱼ * SimKernel.h⁻¹, zero(eltype(dᵢⱼ)), one(eltype(dᵢⱼ)) * 2) - ∇ᵢWᵢⱼ = ∇Wᵢⱼ(SimKernel, q, xᵢⱼ) - - ρᵢ = Density[i] - ρⱼ = Density[j_index] - vᵢⱼ = Velocity[i] - Velocity[j_index] - - density_symmetric_term = dot(-vᵢⱼ, ∇ᵢWᵢⱼ) - dρdt⁺ = -ρᵢ * (SimConstants.m₀ / ρⱼ) * density_symmetric_term - - Dᵢ = ComputeDensityDiffusionGPU(SimDensityDiffusion, SimKernel, SimConstants, - Density, MotionLimiter, xᵢⱼ, ∇ᵢWᵢⱼ, - dᵢⱼ^2, i, j_index) - - dρdt_acc += dρdt⁺ + Dᵢ - - Pᵢ = Pressure[i] - Pⱼ = Pressure[j_index] - Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ) - f_ab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, q, SimConstants.dx) - dvdt⁺ = -SimConstants.m₀ * (Pfac + f_ab) * ∇ᵢWᵢⱼ - - visc_term = ComputeViscosityGPU(SimViscosity, SimKernel, SimConstants, - Density, xᵢⱼ, vᵢⱼ, ∇ᵢWᵢⱼ, dᵢⱼ^2, i, j_index) - - acc_acc += dvdt⁺ + visc_term - end - end - end - - neighbor_start = NeighborCellStarts[cell_list_index] - neighbor_end = NeighborCellStarts[cell_list_index + 1] - 1 - @inbounds for neighbor_cursor in neighbor_start:neighbor_end - neighbor_index = NeighborCellEntries[neighbor_cursor] - start_index = ParticleRanges[neighbor_index] - end_index = ParticleRanges[neighbor_index + 1] - 1 - for j in start_index:end_index - j_index = ParticleOrder[j] - xᵢⱼ = Position[i] - Position[j_index] - xᵢⱼ² = dot(xᵢⱼ, xᵢⱼ) - if xᵢⱼ² <= SimKernel.H² - dᵢⱼ = sqrt(abs(xᵢⱼ²)) - q = clamp(dᵢⱼ * SimKernel.h⁻¹, zero(eltype(dᵢⱼ)), one(eltype(dᵢⱼ)) * 2) - ∇ᵢWᵢⱼ = ∇Wᵢⱼ(SimKernel, q, xᵢⱼ) - - ρᵢ = Density[i] - ρⱼ = Density[j_index] - vᵢⱼ = Velocity[i] - Velocity[j_index] - - density_symmetric_term = dot(-vᵢⱼ, ∇ᵢWᵢⱼ) - dρdt⁺ = -ρᵢ * (SimConstants.m₀ / ρⱼ) * density_symmetric_term - - Dᵢ = ComputeDensityDiffusionGPU(SimDensityDiffusion, SimKernel, SimConstants, - Density, MotionLimiter, xᵢⱼ, ∇ᵢWᵢⱼ, - dᵢⱼ^2, i, j_index) - - dρdt_acc += dρdt⁺ + Dᵢ - - Pᵢ = Pressure[i] - Pⱼ = Pressure[j_index] - Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ) - f_ab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, q, SimConstants.dx) - dvdt⁺ = -SimConstants.m₀ * (Pfac + f_ab) * ∇ᵢWᵢⱼ - - visc_term = ComputeViscosityGPU(SimViscosity, SimKernel, SimConstants, - Density, xᵢⱼ, vᵢⱼ, ∇ᵢWᵢⱼ, dᵢⱼ^2, i, j_index) - - acc_acc += dvdt⁺ + visc_term + cell = CellCoords[i] + @inbounds for offset_index in eachindex(NeighborOffsets) + neighbor_cell = cell + NeighborOffsets[offset_index] + hash_val = HashCell(neighbor_cell) + bucket = mod(hash_val, BucketCount) + 1 + start_index = BucketStarts[bucket] + end_index = BucketEnds[bucket] + if start_index <= end_index + @inbounds for j in start_index:end_index + j_index = BucketParticles[j] + if CellCoords[j_index] == neighbor_cell && j_index != i + xᵢⱼ = Position[i] - Position[j_index] + xᵢⱼ² = dot(xᵢⱼ, xᵢⱼ) + if xᵢⱼ² <= SimKernel.H² + dᵢⱼ = sqrt(abs(xᵢⱼ²)) + q = clamp(dᵢⱼ * SimKernel.h⁻¹, zero(eltype(dᵢⱼ)), one(eltype(dᵢⱼ)) * 2) + ∇ᵢWᵢⱼ = ∇Wᵢⱼ(SimKernel, q, xᵢⱼ) + + ρᵢ = Density[i] + ρⱼ = Density[j_index] + vᵢⱼ = Velocity[i] - Velocity[j_index] + + density_symmetric_term = dot(-vᵢⱼ, ∇ᵢWᵢⱼ) + dρdt⁺ = -ρᵢ * (SimConstants.m₀ / ρⱼ) * density_symmetric_term + + Dᵢ = ComputeDensityDiffusionGPU(SimDensityDiffusion, SimKernel, SimConstants, + Density, MotionLimiter, xᵢⱼ, ∇ᵢWᵢⱼ, + dᵢⱼ^2, i, j_index) + + dρdt_acc += dρdt⁺ + Dᵢ + + Pᵢ = Pressure[i] + Pⱼ = Pressure[j_index] + Pfac = (Pᵢ + Pⱼ) / (ρᵢ * ρⱼ) + f_ab = tensile_correction(SimKernel, Pᵢ, ρᵢ, Pⱼ, ρⱼ, q, SimConstants.dx) + dvdt⁺ = -SimConstants.m₀ * (Pfac + f_ab) * ∇ᵢWᵢⱼ + + visc_term = ComputeViscosityGPU(SimViscosity, SimKernel, SimConstants, + Density, xᵢⱼ, vᵢⱼ, ∇ᵢWᵢⱼ, dᵢⱼ^2, i, j_index) + + acc_acc += dvdt⁺ + visc_term + end + end end end end @@ -557,7 +580,7 @@ end function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV, SimKernel, SimConstants, CUDAParticles::CUDAParticleBuffers, - NeighborBuffers::CUDANeighborBuffers, dρdtI, Acceleration, Launch; + CUDAGrid::CUDAGridBuffers, dρdtI, Acceleration, Launch; Position = CUDAParticles.Position, Density = CUDAParticles.Density, Pressure = CUDAParticles.Pressure, @@ -577,9 +600,9 @@ function NeighborLoopPerParticleCUDA!(SimDensityDiffusion::SDD, SimViscosity::SV @cuda threads=Launch.Threads blocks=Launch.Blocks NeighborLoopCUDAKernel!( dρdtI, Acceleration, Position, Density, Pressure, - Velocity, CUDAParticles.MotionLimiter, NeighborBuffers.CellListIndices, - NeighborBuffers.ParticleRanges, NeighborBuffers.ParticleOrder, - NeighborBuffers.NeighborCellStarts, NeighborBuffers.NeighborCellEntries, + Velocity, CUDAParticles.MotionLimiter, CUDAGrid.CellCoords, + CUDAGrid.BucketStarts, CUDAGrid.BucketEnds, CUDAGrid.BucketParticles, + CUDAGrid.NeighborOffsets, CUDAGrid.BucketCount, SimKernel, SimConstants, SimDensityDiffusion, SimViscosity, ) @@ -593,19 +616,22 @@ end ParticleOrder, CellOffsets, NeighborCellLists, CUDAParticles::CUDAParticleBuffers, CUDASupport::CUDASupportBuffers, MotionBuffers, - NeighborBuffers::CUDANeighborBuffers) where { + CUDAGrid::CUDAGridBuffers) where { Dimensions, FloatType, SMode, KMode, BMode, LMode, SDD<:SPHDensityDiffusion, SV<:SPHViscosity} - UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) - Launch = KernelLaunchConfig(length(CUDAParticles.Position)) + RebuildNeighborGridCUDA!(CUDAGrid, CUDAParticles, SimKernel, Launch) dt = ΔtCUDA(CUDASupport, CUDAParticles, SimConstants, SimKernel, Launch) dt₂ = dt * 0.5 while SimMetaData.TotalTime <= next_output_time(SimMetaData) @timeit SimMetaData.HourGlass "01 Calculate IndexCounter" SimMetaData.Δx = UpdateΔxCUDA!(SimMetaData.Δx, CUDASupport, CUDAParticles, Launch) + if SimMetaData.Δx >= SimKernel.h + @timeit SimMetaData.HourGlass "01a Rebuild Neighbor Grid" RebuildNeighborGridCUDA!(CUDAGrid, CUDAParticles, SimKernel, Launch) + SimMetaData.Δx = zero(eltype(CUDASupport.DρdtI)) + end if MotionBuffers !== nothing @timeit SimMetaData.HourGlass "Motion" ProgressMotionCUDA!(CUDAParticles, MotionBuffers, SimMetaData.TotalTime, dt₂, Launch) @@ -615,7 +641,7 @@ end @timeit SimMetaData.HourGlass "04 First NeighborLoop" NeighborLoopPerParticleCUDA!( SimDensityDiffusion, SimViscosity, SimKernel, SimConstants, - CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, Launch, + CUDAParticles, CUDAGrid, CUDASupport.DρdtI, CUDAParticles.Acceleration, Launch, ) @timeit SimMetaData.HourGlass "05 Update To Half TimeStep" HalfTimeStepCUDA!(CUDAParticles, CUDASupport, dt₂, SimConstants, Launch) @@ -629,7 +655,7 @@ end @timeit SimMetaData.HourGlass "07 Pressure" PressureCUDA!(CUDAParticles.Pressure, CUDASupport.ρₙ⁺, SimConstants, Launch) @timeit SimMetaData.HourGlass "08 Second NeighborLoop" NeighborLoopPerParticleCUDA!( SimDensityDiffusion, SimViscosity, SimKernel, SimConstants, - CUDAParticles, NeighborBuffers, CUDASupport.DρdtI, CUDAParticles.Acceleration, Launch, + CUDAParticles, CUDAGrid, CUDASupport.DρdtI, CUDAParticles.Acceleration, Launch, Position = CUDASupport.Positionₙ⁺, Density = CUDASupport.ρₙ⁺, Velocity = CUDASupport.Velocityₙ⁺, @@ -711,13 +737,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} CUDAParticles = BuildCUDAParticleBuffers(SimParticles) CUDASupport = BuildCUDASupportBuffers(dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ) MotionBuffers = MotionDefinition === nothing ? nothing : BuildCUDAMotionBuffers(SimParticles, MotionDefinition, Val(Dimensions), FloatType) - NeighborBuffers = BuildEmptyCUDANeighborBuffers() - SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) - if SimMetaData.IndexCounter > 0 - unique_cells_view = view(UniqueCells, 1:SimMetaData.IndexCounter) - BuildNeighborCellLists!(NeighborCellLists, FullStencil, unique_cells_view, ParticleRanges, CellLookup) - RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) - end + CUDAGrid = BuildCUDAGridBuffers(SimParticles, FullStencil) @inbounds while true @timeit SimMetaData.HourGlass "00 SimulationLoop" SimulationLoopCUDA( @@ -725,24 +745,21 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} SimConstants, SimParticles, FullStencil, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets, NeighborCellLists, CUDAParticles, CUDASupport, - MotionBuffers, NeighborBuffers, + MotionBuffers, CUDAGrid, ) push!(SimMetaData.TimeSteps, SimMetaData.CurrentTimeStep) - if SimMetaData.Δx >= SimKernel.h + LogStep!(SimMetaData, SimLogger) + + SimMetaData.OutputIterationCounter += 1 + + if SimMetaData.ExportGridCellParticleCounts SimMetaData.IndexCounter = UpdateNeighbors!(SimParticles, SimKernel.H⁻¹, ParticleRanges, UniqueCells, CellLookup, ParticleOrder, CellOffsets) - SimMetaData.Δx = zero(eltype(dρdtI)) if SimMetaData.IndexCounter > 0 unique_cells_view = view(UniqueCells, 1:SimMetaData.IndexCounter) BuildNeighborCellLists!(NeighborCellLists, FullStencil, unique_cells_view, ParticleRanges, CellLookup) - RefreshCUDANeighborBuffers!(NeighborBuffers, SimParticles, ParticleRanges, ParticleOrder, NeighborCellLists, CellLookup) end end - - LogStep!(SimMetaData, SimLogger) - - SimMetaData.OutputIterationCounter += 1 - UniqueCellsView = view(UniqueCells, 1:SimMetaData.IndexCounter) @timeit SimMetaData.HourGlass "13 Determine Output" SPHCellList.DetermineOutput( From 8e1de54b9577c5ef503c574e80ec6fd4dcfcdb0e Mon Sep 17 00:00:00 2001 From: AhmedSalih3d <36305327+AhmedSalih3d@users.noreply.github.com> Date: Fri, 16 Jan 2026 20:12:27 +0100 Subject: [PATCH 15/15] Fix CUDA grid buffer constructor --- src/SPHCUDA.jl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/SPHCUDA.jl b/src/SPHCUDA.jl index de48ae5b..9def9b85 100644 --- a/src/SPHCUDA.jl +++ b/src/SPHCUDA.jl @@ -143,13 +143,12 @@ function BuildCUDAMotionBuffers(SimParticles, MotionDefinition, ::Val{Dimensions ) end -function BuildCUDAGridBuffers(SimParticles, FullStencil) +function BuildCUDAGridBuffers(SimParticles, FullStencil, ::Val{D}) where {D} NumberOfPoints = length(SimParticles.Position) - Dimensions = length(first(SimParticles.Position)) BucketCount = max(1, 2 * NumberOfPoints) - NeighborOffsets = [SVector{Dimensions, Int}(Tuple(offset)) for offset in FullStencil] + NeighborOffsets = [SVector{D, Int}(Tuple(offset)) for offset in FullStencil] return CUDAGridBuffers( - CuArray(similar(SimParticles.Position, SVector{Dimensions, Int})), + CuArray(similar(SimParticles.Position, SVector{D, Int})), CuArray(zeros(Int, BucketCount)), CuArray(zeros(Int, BucketCount)), CuArray(zeros(Int, BucketCount)), @@ -737,7 +736,7 @@ function RunSimulationCUDA(;SimGeometry::Vector{Geometry{Dimensions, FloatType}} CUDAParticles = BuildCUDAParticleBuffers(SimParticles) CUDASupport = BuildCUDASupportBuffers(dρdtI, Velocityₙ⁺, Positionₙ⁺, ρₙ⁺, ∇Cᵢ, ∇◌rᵢ) MotionBuffers = MotionDefinition === nothing ? nothing : BuildCUDAMotionBuffers(SimParticles, MotionDefinition, Val(Dimensions), FloatType) - CUDAGrid = BuildCUDAGridBuffers(SimParticles, FullStencil) + CUDAGrid = BuildCUDAGridBuffers(SimParticles, FullStencil, Val(Dimensions)) @inbounds while true @timeit SimMetaData.HourGlass "00 SimulationLoop" SimulationLoopCUDA(