From c2190e6cfbe8ddeda2091c47908415937f774a98 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Mon, 3 Mar 2025 16:20:22 -0700
Subject: [PATCH 01/83] Initial check-in of python/miniWeather.py

This is NOT COMPLETE YET. It's a port of cpp/miniWeather_serial.cpp.
---
 python/miniWeather.py | 869 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 869 insertions(+)
 create mode 100644 python/miniWeather.py

diff --git a/python/miniWeather.py b/python/miniWeather.py
new file mode 100644
index 0000000..6bb16a3
--- /dev/null
+++ b/python/miniWeather.py
@@ -0,0 +1,869 @@
+
+# //////////////////////////////////////////////////////////////////////////////////////////
+# // miniWeather
+# // Author: Matt Norman, Oak Ridge National Laboratory
+# // This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+# // For documentation, please see the attached documentation in the "documentation" folder
+# //
+# //////////////////////////////////////////////////////////////////////////////////////////
+
+#include
+#include
+#include
+#include
+#include "const.h"
+#include "pnetcdf.h"
+#include
+
+# "Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction
+hs: int = 2
+
+# real can be either float or double, depending on cpp/const.h.
+
+#typedef yakl::Array real1d;
+def real1d(name: str, nx: int):
+    # FIXME (mfh 2025/03/03) This should NOT return the same thing as doub1d; need to change type
+    return np.zeros(nx) # FIXME (mfh 2025/03/03)
+
+#typedef yakl::Array real2d;
+# Callers pass extents slowest-varying first, i.e. (nz, nx), so the
+# parameters are declared in that order, matching the returned shape.
+def real2d(name: str, nz: int, nx: int):
+    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+
+#typedef yakl::Array real3d;
+# Callers pass (nvars, nz, nx), matching the returned shape.
+def real3d(name: str, nvars: int, nz: int, nx: int):
+    return np.zeros((nvars, nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+
+#typedef yakl::Array doub2d;
+def doub2d(name: str, nz: int, nx: int):
+    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+
+typedef yakl::Array realConst1d;
+typedef yakl::Array realConst2d;
+typedef yakl::Array realConst3d;
+typedef yakl::Array doubConst1d;
+typedef yakl::Array doubConst2d;
+typedef yakl::Array doubConst3d;
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are initialized but remain static over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+struct Fixed_data {
+  int nx, nz;                //Number of local grid cells in the x- and z- dimensions for this MPI task
+  int i_beg, k_beg;          //beginning index in the x- and z-directions for this MPI task
+  int nranks, myrank;        //Number of MPI ranks and my rank id
+  int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain
+  int mainproc;              //Am I the main process (rank == 0)?
+  realConst1d hy_dens_cell;       //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+  realConst1d hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+  realConst1d hy_dens_int;        //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+  realConst1d hy_dens_theta_int;  //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+  realConst1d hy_pressure_int;    //hydrostatic press (vert cell interf).
Dimensions: (1:nz+1) +}; + +/////////////////////////////////////////////////////////////////////////////////////// +// THE MAIN PROGRAM STARTS HERE +/////////////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + #MPI_Init(&argc,&argv); + #yakl::init(); + { + Fixed_data fixed_data; + real3d state; + real dt; //Model time step (seconds) + + # init allocates state + (fixed_data, state, dt) = init() # init( state , dt , fixed_data ); + + auto &mainproc = fixed_data.mainproc; + + //Initial reductions for mass, kinetic energy, and total energy + double mass0, te0; + reductions(state,mass0,te0,fixed_data); + + int num_out = 0; //The number of outputs performed so far + real output_counter = 0; //Helps determine when it's time to do output + real etime = 0; + + //Output the initial state + if (output_freq >= 0) { + output(state,etime,num_out,fixed_data); + } + + int direction_switch = 1; // Tells dimensionally split which order to take x,z solves + + //////////////////////////////////////////////////// + // MAIN TIME STEP LOOP + //////////////////////////////////////////////////// + auto t1 = std::chrono::steady_clock::now(); + while (etime < sim_time) { + //If the time step leads to exceeding the simulation time, shorten it for the last step + if (etime + dt > sim_time) { dt = sim_time - etime; } + # Perform a single time step + direction_switch = perform_timestep(state, dt, direction_switch, fixed_data) + //Inform the user + #ifndef NO_INFORM + if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } + #endif + //Update the elapsed time and output counter + etime = etime + dt; + output_counter = output_counter + dt; + //If it's time for output, reset the counter, and do output + if (output_freq >= 0 && output_counter >= output_freq) { + output_counter = output_counter - output_freq; + output(state,etime,num_out,fixed_data); + } + } + auto t2 = std::chrono::steady_clock::now(); + if (mainproc) { + std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + } + + //Final reductions for mass, kinetic energy, and total energy + double mass, te; + reductions(state,mass,te,fixed_data); + + if (mainproc) { + printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); + printf( "d_te: %le\n" , (te - te0 )/te0 ); + } + + finalize(); + } + yakl::finalize(); + MPI_Finalize(); +} + + +# Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator +# The dimensional splitting is a second-order-accurate alternating Strang splitting in which the +# order of directions is alternated each time step. 
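+# (Why alternate: a single x-then-z sweep by itself is only first-order
+# accurate in dt; pairing it with a z-then-x sweep on the next step cancels
+# the leading splitting-error term over each pair of steps, which is what
+# makes the alternation second-order accurate.)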
+# The Runge-Kutta method used here is defined as follows: +# q* = q_n + dt/3 * rhs(q_n) +# q** = q_n + dt/2 * rhs(q* ) +# q_n+1 = q_n + dt/1 * rhs(q**) +def perform_timestep( + state, # real3d const&, input parameter + dt, # real, must be an input parameter + direction_switch, # int&, in/out parameter, transformed to input and return value + fixed_data # Fixed_data const &, input parameter + ) -> int: # was void; now returns direction_switch + + nx = fixed_data.nx + nz = fixed_data.nz + + state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs); + + if direction_switch != 0: + # x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , fixed_data ) + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data ) + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , fixed_data ) + # z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , fixed_data ) + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data ) + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , fixed_data ) + else: + # z-direction second + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , fixed_data ) + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data ) + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , fixed_data ) + # x-direction first + semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , fixed_data ) + semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data ) + semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , fixed_data ) + + if direction_switch: + direction_switch = 0 + else: + direction_switch = 1 + + return direction_switch + + + +//Perform a single semi-discretized step in time with the form: +//state_out = state_init + dt * rhs(state_forcing) +//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +void semi_discrete_step( realConst3d state_init , real3d const &state_forcing , real3d const &state_out , real dt , int dir , Fixed_data const &fixed_data ) { + auto &nx = fixed_data.nx ; + auto &nz = fixed_data.nz ; + auto &i_beg = fixed_data.i_beg ; + auto &k_beg = fixed_data.k_beg ; + auto &hy_dens_cell = fixed_data.hy_dens_cell ; + + tend = real3d("tend", NUM_VARS, nz, nx); + + if (dir == DIR_X) { + //Set the halo values for this MPI task's fluid state in the x-direction + yakl::timer_start("halo x"); + set_halo_values_x(state_forcing,fixed_data); + yakl::timer_stop("halo x"); + //Compute the time tendencies for the fluid state in the x-direction + yakl::timer_start("tendencies x"); + compute_tendencies_x(state_forcing,tend,dt,fixed_data); + yakl::timer_stop("tendencies x"); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + yakl::timer_start("halo z"); + set_halo_values_z(state_forcing,fixed_data); + yakl::timer_stop("halo z"); + //Compute the time tendencies for the fluid state in the z-direction + yakl::timer_start("tendencies z"); + compute_tendencies_z(state_forcing,tend,dt,fixed_data); + yakl::timer_stop("tendencies z"); + } + + ///////////////////////////////////////////////// + // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + yakl::timer_start("apply tendencies"); + for (int ll=0; ll stencil; + SArray d3_vals; + SArray vals; + //Use fourth-order interpolation from 
four cell averages to compute the value at the interface in question + for (int ll=0; ll stencil; + SArray d3_vals; + SArray vals; + //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for (int ll=0; ll qpoints; + # SArray qweights; + + qpoints = np.array([0.112701665379258311482073460022, 0.500000000000000000000000000000, 0.887298334620741688517926539980], dtype=real) + qweights = np.array([0.277777777777777777777777777779, 0.444444444444444444444444444444, 0.277777777777777777777777777779], dtype=real) + + # ////////////////////////////////////////////////////////////////////////// + # Initialize the cell-averaged fluid state via Gauss-Legendre quadrature + #////////////////////////////////////////////////////////////////////////// + #/////////////////////////////////////////////// + # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR + #/////////////////////////////////////////////// + for k in range(nz+2*hs): + for i in range(nx+2*hs): + # Initialize the state to zero + for ll in range(NUM_VARS): + state[ll,k,i] = 0.0 + + # Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation + for kk in range(nqpoints): + for ii in range(nqpoints): + # Compute the x,z location within the global domain based on cell and quadrature index + x: real = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx + z: real = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz + # real r, u, w, t, hr, ht; + + # The above real variables are probably output parameters + # of collision, thermal, gravity_waves, density_current, and injection. + # x and z are probably input parameters of these functions. + + # Set the fluid state based on the user's specification + if data_spec_int == DATA_SPEC_COLLISION: + (r,u,w,t,hr,ht) = collision(x,z) + if data_spec_int == DATA_SPEC_THERMAL: + (r,u,w,t,hr,ht) = thermal(x,z) + if data_spec_int == DATA_SPEC_GRAVITY_WAVES: + (r,u,w,t,hr,ht) = gravity_waves(x,z) + if data_spec_int == DATA_SPEC_DENSITY_CURRENT: + (r,u,w,t,hr,ht) = density_current(x,z) + if data_spec_int == DATA_SPEC_INJECTION: + (r,u,w,t,hr,ht) = injection(x,z) + + # Store into the fluid state array + state[ID_DENS,k,i] += r * qweights[ii]*qweights[kk]; + state[ID_UMOM,k,i] += (r+hr)*u * qweights[ii]*qweights[kk]; + state[ID_WMOM,k,i] += (r+hr)*w * qweights[ii]*qweights[kk]; + state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]; + + hy_dens_cell = real1d("hy_dens_cell ", nz+2*hs); + hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs); + hy_dens_int = real1d("hy_dens_int ", nz+1); + hy_dens_theta_int = real1d("hy_dens_theta_int ", nz+1); + hy_pressure_int = real1d("hy_pressure_int ", nz+1); + + //Compute the hydrostatic background state over vertical cell averages + ///////////////////////////////////////////////// + // TODO: MAKE THIS LOOP A PARALLEL_FOR + ///////////////////////////////////////////////// + for (int k=0; k Date: Mon, 3 Mar 2025 17:09:42 -0700 Subject: [PATCH 02/83] Progress on Python port; not done yet --- python/miniWeather.py | 481 ++++++++++++++++++++++-------------------- 1 file changed, 252 insertions(+), 229 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 6bb16a3..ab8e3ba 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -67,9 +67,9 @@ def doub2d(name: string, nx: int, nz: int): #MPI_Init(&argc,&argv); #yakl::init(); { - Fixed_data fixed_data; - real3d state; - real dt; //Model time step (seconds) + #Fixed_data fixed_data; + #real3d state; + #real dt; 
//Model time step (seconds) # init allocates state (fixed_data, state, dt) = init() # init( state , dt , fixed_data ); @@ -122,15 +122,15 @@ def doub2d(name: string, nx: int, nz: int): double mass, te; reductions(state,mass,te,fixed_data); - if (mainproc) { - printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); - printf( "d_te: %le\n" , (te - te0 )/te0 ); - } + if mainproc: + print( "d_mass: %le\n" % ((mass - mass0)/mass0) ) + print( "d_te: %le\n" % ((te - te0 )/te0 ) ) - finalize(); + finalize() } - yakl::finalize(); - MPI_Finalize(); + + # yakl::finalize(); + # MPI_Finalize(); } @@ -213,26 +213,25 @@ def perform_timestep( yakl::timer_stop("tendencies z"); } - ///////////////////////////////////////////////// - // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR - ///////////////////////////////////////////////// - //Apply the tendencies to the fluid state - yakl::timer_start("apply tendencies"); - for (int ll=0; ll tuple[real3d, real, Fixed_data]: #real3d state #Fixed_data fixed_data #real dt @@ -514,8 +506,16 @@ def init(): # tuple[real3d, real, Fixed_data]: # SArray qpoints; # SArray qweights; - qpoints = np.array([0.112701665379258311482073460022, 0.500000000000000000000000000000, 0.887298334620741688517926539980], dtype=real) - qweights = np.array([0.277777777777777777777777777779, 0.444444444444444444444444444444, 0.277777777777777777777777777779], dtype=real) + qpoints = np.array([ + 0.112701665379258311482073460022, + 0.500000000000000000000000000000, + 0.887298334620741688517926539980 + ], dtype=real) + qweights = np.array([ + 0.277777777777777777777777777779, + 0.444444444444444444444444444444, + 0.277777777777777777777777777779 + ], dtype=real) # ////////////////////////////////////////////////////////////////////////// # Initialize the cell-averaged fluid state via Gauss-Legendre quadrature @@ -565,103 +565,120 @@ def init(): # tuple[real3d, real, Fixed_data]: hy_dens_theta_int = real1d("hy_dens_theta_int ", nz+1); hy_pressure_int = real1d("hy_pressure_int ", nz+1); - //Compute the hydrostatic background state over vertical cell averages - ///////////////////////////////////////////////// - // TODO: MAKE THIS LOOP A PARALLEL_FOR - ///////////////////////////////////////////////// - for (int k=0; k None: + pass + + +# Compute reduced quantities for error checking without resorting to the "ncdiff" tool +#void reductions( realConst3d state , double &mass , double &te , Fixed_data const &fixed_data ) { +def reductions( + state # realConst3d, an input parameter + fixed_data # Fixed_data const&, an input parameter + ) -> tuple[double, double]: # mass, te + + nx = fixed_data.nx + nz = fixed_data.nz + hy_dens_cell = fixed_data.hy_dens_cell + hy_dens_theta_cell = fixed_data.hy_dens_theta_cell + + mass = 0 + te = 0 + for k in range(nz): + for i in range(nx): + r = state[ID_DENS,hs+k,hs+i] + hy_dens_cell[hs+k] # Density + u = state[ID_UMOM,hs+k,hs+i] / r # U-wind + w = state[ID_WMOM,hs+k,hs+i] / r # W-wind + th = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / r # Potential Temperature (theta) + p = C0*pow(r*th,gamm) # Pressure + t = th / pow(p0/p,rd/cp) # Temperature + ke = r*(u*u+w*w) # Kinetic Energy + ie = r*cv*t # Internal Energy + mass += r *dx*dz # Accumulate domain mass + te += (ke + ie)*dx*dz # Accumulate domain total energy + + #double glob[2], loc[2]; + #loc[0] = mass; + #loc[1] = te; + #int ierr = MPI_Allreduce(loc,glob,2,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); + #mass = glob[0]; + #te = glob[1]; + + return (mass, te) From 609c239aad0245752b26185616390d8f740bbde1 Mon Sep 17 
00:00:00 2001 From: Mark Hoemmen Date: Mon, 3 Mar 2025 22:46:58 -0700 Subject: [PATCH 03/83] Progress on porting to Python --- python/miniWeather.py | 702 ++++++++++++++++++++++-------------------- 1 file changed, 368 insertions(+), 334 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index ab8e3ba..d601022 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -60,78 +60,73 @@ def doub2d(name: string, nx: int, nz: int): realConst1d hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) }; -/////////////////////////////////////////////////////////////////////////////////////// -// THE MAIN PROGRAM STARTS HERE -/////////////////////////////////////////////////////////////////////////////////////// -int main(int argc, char **argv) { +# /////////////////////////////////////////////////////////////////////////////////////// +# // THE MAIN PROGRAM STARTS HERE +# /////////////////////////////////////////////////////////////////////////////////////// +def main() -> None: #MPI_Init(&argc,&argv); #yakl::init(); - { - #Fixed_data fixed_data; - #real3d state; - #real dt; //Model time step (seconds) - # init allocates state - (fixed_data, state, dt) = init() # init( state , dt , fixed_data ); + #Fixed_data fixed_data; + #real3d state; + #real dt; //Model time step (seconds) - auto &mainproc = fixed_data.mainproc; + # init allocates state + (fixed_data, state, dt) = init() # init( state , dt , fixed_data ); - //Initial reductions for mass, kinetic energy, and total energy - double mass0, te0; - reductions(state,mass0,te0,fixed_data); + mainproc = fixed_data.mainproc; - int num_out = 0; //The number of outputs performed so far - real output_counter = 0; //Helps determine when it's time to do output - real etime = 0; + # Initial reductions for mass, kinetic energy, and total energy + (mass0, te0) = reductions(state, fixed_data) - //Output the initial state - if (output_freq >= 0) { - output(state,etime,num_out,fixed_data); - } + num_out: int = 0 # The number of outputs performed so far + output_counter: real = 0 # Helps determine when it's time to do output + etime: real = 0 - int direction_switch = 1; // Tells dimensionally split which order to take x,z solves - - //////////////////////////////////////////////////// - // MAIN TIME STEP LOOP - //////////////////////////////////////////////////// - auto t1 = std::chrono::steady_clock::now(); - while (etime < sim_time) { - //If the time step leads to exceeding the simulation time, shorten it for the last step - if (etime + dt > sim_time) { dt = sim_time - etime; } - # Perform a single time step - direction_switch = perform_timestep(state, dt, direction_switch, fixed_data) - //Inform the user - #ifndef NO_INFORM - if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } - #endif - //Update the elapsed time and output counter - etime = etime + dt; - output_counter = output_counter + dt; - //If it's time for output, reset the counter, and do output - if (output_freq >= 0 && output_counter >= output_freq) { - output_counter = output_counter - output_freq; - output(state,etime,num_out,fixed_data); - } - } - auto t2 = std::chrono::steady_clock::now(); - if (mainproc) { - std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; - } + # Output the initial state + if output_freq >= 0: + num_out = output(state, etime, num_out, fixed_data) - //Final reductions for mass, kinetic energy, and total energy - double mass, te; - reductions(state,mass,te,fixed_data); + 
direction_switch: int = 1 # Tells dimensionally split which order to take x,z solves + # //////////////////////////////////////////////////// + # MAIN TIME STEP LOOP + # //////////////////////////////////////////////////// + t1 = std::chrono::steady_clock::now() + + while etime < sim_time: + # If the time step leads to exceeding the simulation time, shorten it for the last step + if etime + dt > sim_time: + dt = sim_time - etime + + # Perform a single time step + direction_switch = perform_timestep(state, dt, direction_switch, fixed_data) + # Inform the user if mainproc: - print( "d_mass: %le\n" % ((mass - mass0)/mass0) ) - print( "d_te: %le\n" % ((te - te0 )/te0 ) ) + print( "Elapsed Time: %lf / %lf\n", etime , sim_time ) + # Update the elapsed time and output counter + etime = etime + dt + output_counter = output_counter + dt + # If it's time for output, reset the counter, and do output + if output_freq >= 0 and output_counter >= output_freq: + output_counter = output_counter - output_freq + num_out = output(state, etime, num_out, fixed_data) + + auto t2 = std::chrono::steady_clock::now(); + if mainproc: + print( "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n" ) - finalize() - } + # Final reductions for mass, kinetic energy, and total energy + (mass, te) = reductions(state, fixed_data) + + if mainproc: + print( "d_mass: %le\n" % ((mass - mass0)/mass0) ) + print( "d_te: %le\n" % ((te - te0 )/te0 ) ) + + finalize() # yakl::finalize(); # MPI_Finalize(); -} # Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator @@ -151,26 +146,26 @@ def perform_timestep( nx = fixed_data.nx nz = fixed_data.nz - state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs); + state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs) if direction_switch != 0: # x-direction first - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , fixed_data ) - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data ) - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , fixed_data ) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, fixed_data) # z-direction second - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , fixed_data ) - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data ) - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , fixed_data ) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, fixed_data) else: # z-direction second - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , fixed_data ) - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data ) - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , fixed_data ) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, fixed_data) # x-direction first - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , fixed_data ) - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data ) - semi_discrete_step( state , state_tmp , state 
, dt / 1 , DIR_X , fixed_data ) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, fixed_data) if direction_switch: direction_switch = 0 @@ -181,37 +176,44 @@ def perform_timestep( -//Perform a single semi-discretized step in time with the form: -//state_out = state_init + dt * rhs(state_forcing) -//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out -void semi_discrete_step( realConst3d state_init , real3d const &state_forcing , real3d const &state_out , real dt , int dir , Fixed_data const &fixed_data ) { - auto &nx = fixed_data.nx ; - auto &nz = fixed_data.nz ; - auto &i_beg = fixed_data.i_beg ; - auto &k_beg = fixed_data.k_beg ; - auto &hy_dens_cell = fixed_data.hy_dens_cell ; - - tend = real3d("tend", NUM_VARS, nz, nx); - - if (dir == DIR_X) { - //Set the halo values for this MPI task's fluid state in the x-direction - yakl::timer_start("halo x"); - set_halo_values_x(state_forcing,fixed_data); - yakl::timer_stop("halo x"); - //Compute the time tendencies for the fluid state in the x-direction - yakl::timer_start("tendencies x"); - compute_tendencies_x(state_forcing,tend,dt,fixed_data); - yakl::timer_stop("tendencies x"); - } else if (dir == DIR_Z) { - //Set the halo values for this MPI task's fluid state in the z-direction - yakl::timer_start("halo z"); - set_halo_values_z(state_forcing,fixed_data); - yakl::timer_stop("halo z"); - //Compute the time tendencies for the fluid state in the z-direction - yakl::timer_start("tendencies z"); - compute_tendencies_z(state_forcing,tend,dt,fixed_data); - yakl::timer_stop("tendencies z"); - } +# Perform a single semi-discretized step in time with the form: +# state_out = state_init + dt * rhs(state_forcing) +# Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out +def semi_discrete_step( + state_init, # realConst3d + state_forcing, # real3d const& + state_out, # real3d const&, + dt: real, + dir: int, + fixed_data # Fixed_data const& + ) -> None: + + nx = fixed_data.nx + nz = fixed_data.nz + i_beg = fixed_data.i_beg + k_beg = fixed_data.k_beg + hy_dens_cell = fixed_data.hy_dens_cell + + tend = real3d("tend", NUM_VARS, nz, nx) + + if dir == DIR_X: + # Set the halo values for this MPI task's fluid state in the x-direction + #yakl::timer_start("halo x"); + set_halo_values_x(state_forcing, fixed_data) + #yakl::timer_stop("halo x"); + # Compute the time tendencies for the fluid state in the x-direction + #yakl::timer_start("tendencies x"); + compute_tendencies_x(state_forcing, tend, dt, fixed_data) + #yakl::timer_stop("tendencies x"); + elif dir == DIR_Z: + # Set the halo values for this MPI task's fluid state in the z-direction + #yakl::timer_start("halo z"); + set_halo_values_z(state_forcing, fixed_data) + #yakl::timer_stop("halo z"); + # Compute the time tendencies for the fluid state in the z-direction + #yakl::timer_start("tendencies z"); + compute_tendencies_z(state_forcing, tend, dt, fixed_data) + #yakl::timer_stop("tendencies z"); # ///////////////////////////////////////////////// # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR @@ -231,154 +233,174 @@ def perform_timestep( # yakl::timer_stop("apply tendencies"); - # FIXME (mfh 2025/03/03) This should return something; not sure what yet + # NOTE It's OK for this not to return anything, + # as long as we can treat 
state_out as an output parameter. -//Compute the time tendencies of the fluid state using forcing in the x-direction -//Since the halos are set in a separate routine, this will not require MPI -//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) -//Then, compute the tendencies using those fluxes -void compute_tendencies_x( realConst3d state , real3d const &tend , real dt , Fixed_data const &fixed_data ) { - auto &nx = fixed_data.nx ; - auto &nz = fixed_data.nz ; - auto &hy_dens_cell = fixed_data.hy_dens_cell ; - auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell; - - flux = real3d("flux", NUM_VARS, nz, nx+1); - - //Compute the hyperviscosity coefficient - real hv_coef = -hv_beta * dx / (16*dt); - ///////////////////////////////////////////////// - // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR - ///////////////////////////////////////////////// - //Compute fluxes in the x-direction for each cell - for (int k=0; k stencil; - SArray d3_vals; - SArray vals; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll None: - ///////////////////////////////////////////////// - // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR - ///////////////////////////////////////////////// - //Use the fluxes to compute tendencies for each cell - for (int ll=0; ll stencil; - SArray d3_vals; - SArray vals; - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question - for (int ll=0; ll stencil; + #SArray d3_vals; + #SArray vals; + + stencil = np.zeros((1, 4), dtype=real) + d3_vals = np.zeros((1, NUM_VARS), dtype=real) + vals = np.zeros((1, NUM_VARS), dtype=real) + + # Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for ll in range(NUM_VARS): + for s in range(sten_size): + stencil[s] = state[ll,hs+k,i+s] + + # Fourth-order-accurate interpolation of the state + vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12 + # First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity) + d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3] + + # Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) + r = vals[ID_DENS] + hy_dens_cell[hs+k] + u = vals[ID_UMOM] / r + w = vals[ID_WMOM] / r + t = (vals[ID_RHOT] + hy_dens_theta_cell[hs+k]) / r + p = C0*pow((r*t), gamm) + + # Compute the flux vector + flux[ID_DENS,k,i] = r*u - hv_coef*d3_vals[ID_DENS] + flux[ID_UMOM,k,i] = r*u*u+p - hv_coef*d3_vals[ID_UMOM] + flux[ID_WMOM,k,i] = r*u*w - hv_coef*d3_vals[ID_WMOM] + flux[ID_RHOT,k,i] = r*u*t - hv_coef*d3_vals[ID_RHOT] + + # ///////////////////////////////////////////////// + # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR + # ///////////////////////////////////////////////// + # Use the fluxes to compute tendencies for each cell + for ll in range(NUM_VARS): + for k in range(nz): + for i in range(nx): + tend[ll,k,i] = -( flux[ll,k,i+1] - flux[ll,k,i] ) / dx + + # NOTE It's OK for this not to return anything, + # as long as we can treat tend as an output parameter. 
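+
+    # Porting note: the two cell loops above map directly onto NumPy
+    # slicing, which is the natural replacement for the C++ PARALLEL_FOR.
+    # A minimal sketch (assuming the array shapes defined in this file;
+    # not wired in yet):
+    #
+    #   s0 = state[:, hs:hs+nz, 0:nx+1]
+    #   s1 = state[:, hs:hs+nz, 1:nx+2]
+    #   s2 = state[:, hs:hs+nz, 2:nx+3]
+    #   s3 = state[:, hs:hs+nz, 3:nx+4]
+    #   vals    = -s0/12 + 7*s1/12 + 7*s2/12 - s3/12
+    #   d3_vals = -s0 + 3*s1 - 3*s2 + s3
+    #   r = vals[ID_DENS] + hy_dens_cell[hs:hs+nz, None]
+    #   u = vals[ID_UMOM] / r
+    #   w = vals[ID_WMOM] / r
+    #   t = (vals[ID_RHOT] + hy_dens_theta_cell[hs:hs+nz, None]) / r
+    #   p = C0 * (r*t)**gamm
+    #   flux[:] = np.stack([r*u, r*u*u + p, r*u*w, r*u*t]) - hv_coef * d3_vals
+    #   tend[:] = -(flux[:, :, 1:] - flux[:, :, :-1]) / dx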
+ + +# Compute the time tendencies of the fluid state using forcing in the z-direction +# Since the halos are set in a separate routine, this will not require MPI +# First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) +# Then, compute the tendencies using those fluxes +def compute_tendencies_z( + state, # realConst3d + tend, # real3d const& + dt, # real + fixed_data # Fixed_data const& + ) -> None: + + nx = fixed_data.nx + nz = fixed_data.nz + hy_dens_int = fixed_data.hy_dens_int + hy_dens_theta_int = fixed_data.hy_dens_theta_int + hy_pressure_int = fixed_data.hy_pressure_int + + flux = real3d("flux", NUM_VARS, nz+1, nx) + + # Compute the hyperviscosity coefficient + hv_coef: real = -hv_beta * dz / (16*dt); + # ///////////////////////////////////////////////// + # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR + # ///////////////////////////////////////////////// + # Compute fluxes in the x-direction for each cell + for k in range(nz+1): + for i in range(nx): + # "Stack Array" -- local multidimensional array type with compile-time extents + #SArray stencil; + #SArray d3_vals; + #SArray vals; + + stencil = np.zeros((1, 4), dtype=real) + d3_vals = np.zeros((1, NUM_VARS), dtype=real) + vals = np.zeros((1, NUM_VARS), dtype=real) + + # Use fourth-order interpolation from four cell averages to compute the value at the interface in question + for ll in range(NUM_VARS): + for s in range(sten_size): + stencil[s] = state[ll,k+s,hs+i]; + + # Fourth-order-accurate interpolation of the state + vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12 + # First-order-accurate interpolation of the third spatial derivative of the state + d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3] + + # Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) + r: real = vals[ID_DENS] + hy_dens_int[k]; + u: real = vals[ID_UMOM] / r; + w: real = vals[ID_WMOM] / r; + t: real = ( vals[ID_RHOT] + hy_dens_theta_int[k] ) / r; + p: real = C0*pow((r*t),gamm) - hy_pressure_int[k]; + if k == 0 or k == nz: w = 0; - d3_vals(ID_DENS) = 0; - } - - //Compute the flux vector with hyperviscosity - flux(ID_DENS,k,i) = r*w - hv_coef*d3_vals(ID_DENS); - flux(ID_UMOM,k,i) = r*w*u - hv_coef*d3_vals(ID_UMOM); - flux(ID_WMOM,k,i) = r*w*w+p - hv_coef*d3_vals(ID_WMOM); - flux(ID_RHOT,k,i) = r*w*t - hv_coef*d3_vals(ID_RHOT); - } - } + d3_vals[ID_DENS] = 0; - //Use the fluxes to compute tendencies for each cell - ///////////////////////////////////////////////// - // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR - ///////////////////////////////////////////////// - for (int ll=0; ll None: + + nx = fixed_data.nx + nz = fixed_data.nz + k_beg = fixed_data.k_beg + left_rank = fixed_data.left_rank + right_rank = fixed_data.right_rank + myrank = fixed_data.myrank + hy_dens_cell = fixed_data.hy_dens_cell + hy_dens_theta_cell = fixed_data.hy_dens_theta_cell # ////////////////////////////////////////////////////////////////////// # TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS @@ -410,17 +432,20 @@ def perform_timestep( state[ID_UMOM,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 50.0 state[ID_RHOT,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 298.0 - hy_dens_theta_cell[hs+k] - # TODO (mfh 2025/03/03) Return something, not sure what yet + # NOTE (mfh 2025/03/03) Don't need to return anything, as long as state can be an output parameter +# Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI +# decomposition in the vertical direction +def set_halo_values_z( + state, # real3d const& + fixed_data # Fixed_data const& + ) -> None: -//Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI -//decomposition in the vertical direction -void set_halo_values_z( real3d const &state , Fixed_data const &fixed_data ) { - auto &nx = fixed_data.nx ; - auto &nz = fixed_data.nz ; - auto &hy_dens_cell = fixed_data.hy_dens_cell ; - + nx = fixed_data.nx + nz = fixed_data.nz + hy_dens_cell = fixed_data.hy_dens_cell + # ///////////////////////////////////////////////// # // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR # ///////////////////////////////////////////////// @@ -442,7 +467,7 @@ def perform_timestep( state[ll,nz+hs ,i] = state[ll,nz+hs-1,i] state[ll,nz+hs+1,i] = state[ll,nz+hs-1,i] - # TODO (mfh 2025/03/03) Return something, not sure what yet + # NOTE (mfh 2025/03/03) Don't need to return anything, as long as state can be an output parameter # state, dt, and fixed_data used to be output parameters. @@ -732,89 +757,97 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t) return (r, t) -//Sample from an ellipse of a specified center, radius, and amplitude at a specified location -//x and z are input coordinates -//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse -real sample_ellipse_cosine( real x , real z , real amp , real x0 , real z0 , real xrad , real zrad ) { - //Compute distance from bubble center - real dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; - //If the distance from bubble center is less than the radius, create a cos**2 profile - if (dist <= pi / 2.) { - return amp * pow(cos(dist),2.); - } else { - return 0.; - } -} - - -//Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) -//The file I/O uses parallel-netcdf, the only external library required for this mini-app. -//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -void output( realConst3d state , real etime , int &num_out , Fixed_data const &fixed_data ) { - auto &nx = fixed_data.nx ; - auto &nz = fixed_data.nz ; - auto &i_beg = fixed_data.i_beg ; - auto &k_beg = fixed_data.k_beg ; - auto &mainproc = fixed_data.mainproc ; - auto &hy_dens_cell = fixed_data.hy_dens_cell ; - auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell; +# Sample from an ellipse of a specified center, radius, and amplitude at a specified location +# x and z are input coordinates +# amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse +def sample_ellipse_cosine(x: real, z: real, amp: real, x0: real, z0: real, xrad: real, zrad: real) -> real: + # Compute distance from bubble center + dist: real = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0 + # If the distance from bubble center is less than the radius, create a cos**2 profile + if dist <= pi / 2.0: + return amp * pow(cos(dist),2.) + else: + return 0.0 + + +# Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) +# The file I/O uses parallel-netcdf, the only external library required for this mini-app. 
+# If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics +def output( + state, # realConst3d + etime, # real + num_out, # int + fixed_data # Fixed_data const& + ) -> int: # num_out (updated) + + nx = fixed_data.nx + nz = fixed_data.nz + i_beg = fixed_data.i_beg + k_beg = fixed_data.k_beg + mainproc = fixed_data.mainproc + hy_dens_cell = fixed_data.hy_dens_cell + hy_dens_theta_cell = fixed_data.hy_dens_theta_cell int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3]; MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; - //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) - //Inform the user - if (mainproc) { printf("*** OUTPUT ***\n"); } + # Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) + # Inform the user + if mainproc: + print("*** OUTPUT ***\n") # Allocate some (big) temp arrays - dens = doub2d("dens", nz, nx); - uwnd = doub2d("uwnd", nz, nx); - wwnd = doub2d("wwnd", nz, nx); - theta = doub2d("theta", nz, nx); - - //If the elapsed time is zero, create the file. Otherwise, open the file - if (etime == 0) { - //Create the file + dens = doub2d("dens", nz, nx) + uwnd = doub2d("uwnd", nz, nx) + wwnd = doub2d("wwnd", nz, nx) + theta = doub2d("theta", nz, nx) + + # If the elapsed time is zero, create the file. Otherwise, open the file + if etime == 0: + # Create the file ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ ); - //Create the dimensions - ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ ); - ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob , &x_dimid ) , __LINE__ ); - ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob , &z_dimid ) , __LINE__ ); - //Create the variables - dimids[0] = t_dimid; - ncwrap( ncmpi_def_var( ncid , "t" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ); - dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid; - ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ); - ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ); - ncwrap( ncmpi_def_var( ncid , "wwnd" , NC_DOUBLE , 3 , dimids , &wwnd_varid ) , __LINE__ ); - ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ ); - //End "define" mode + # Create the dimensions + ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ ) + ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob , &x_dimid ) , __LINE__ ) + ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob , &z_dimid ) , __LINE__ ) + # Create the variables + dimids[0] = t_dimid + ncwrap( ncmpi_def_var( ncid , "t" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ) + dimids[0] = t_dimid + dimids[1] = z_dimid + dimids[2] = x_dimid + ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ) + ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ) + ncwrap( ncmpi_def_var( ncid , "wwnd" , NC_DOUBLE , 3 , dimids , &wwnd_varid ) , __LINE__ ) + ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ ) + # End "define" mode ncwrap( ncmpi_enddef( ncid ) , __LINE__ ); - } else { - //Open the file - ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ ); - //Get the variable IDs - ncwrap( 
ncmpi_inq_varid( ncid , "dens" , &dens_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "wwnd" , &wwnd_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "t" , &t_varid ) , __LINE__ ); - } - - //Store perturbed values in the temp arrays for output - ///////////////////////////////////////////////// - // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR - ///////////////////////////////////////////////// - for (int k=0; k Date: Tue, 4 Mar 2025 11:18:44 -0700 Subject: [PATCH 04/83] Progress on porting to Python --- python/miniWeather.py | 72 ++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index d601022..fcbf75f 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -7,15 +7,12 @@ # // # ////////////////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include +import timeit #include "const.h" #include "pnetcdf.h" -#include -# "Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +# "Halo" size: number of cells beyond the MPI tasks's domain +# needed for a full "stencil" of information for reconstruction hs: int = 2 # real can be either float or double, depending on cpp/const.h. @@ -67,61 +64,58 @@ def main() -> None: #MPI_Init(&argc,&argv); #yakl::init(); - #Fixed_data fixed_data; - #real3d state; - #real dt; //Model time step (seconds) - - # init allocates state + # fixed_data: Fixed_data + # state: real3d + # dt: real: Model time step (seconds) (fixed_data, state, dt) = init() # init( state , dt , fixed_data ); - mainproc = fixed_data.mainproc; + mainproc = fixed_data.mainproc # Initial reductions for mass, kinetic energy, and total energy (mass0, te0) = reductions(state, fixed_data) - num_out: int = 0 # The number of outputs performed so far - output_counter: real = 0 # Helps determine when it's time to do output - etime: real = 0 + num_out: int = 0 # The number of outputs performed so far + etime: real = 0.0 # Elapsed time # Output the initial state if output_freq >= 0: num_out = output(state, etime, num_out, fixed_data) - direction_switch: int = 1 # Tells dimensionally split which order to take x,z solves - # //////////////////////////////////////////////////// # MAIN TIME STEP LOOP # //////////////////////////////////////////////////// - t1 = std::chrono::steady_clock::now() - - while etime < sim_time: - # If the time step leads to exceeding the simulation time, shorten it for the last step - if etime + dt > sim_time: - dt = sim_time - etime - - # Perform a single time step - direction_switch = perform_timestep(state, dt, direction_switch, fixed_data) - # Inform the user - if mainproc: - print( "Elapsed Time: %lf / %lf\n", etime , sim_time ) - # Update the elapsed time and output counter - etime = etime + dt - output_counter = output_counter + dt - # If it's time for output, reset the counter, and do output - if output_freq >= 0 and output_counter >= output_freq: - output_counter = output_counter - output_freq + def run_simulation(): + direction_switch: int = 1 # Order in which dimensional splitting takes x,z solves + output_counter: real = 0.0 # Helps determine when it's time to do output + + while etime < sim_time: + # If the time step leads to exceeding the simulation time, shorten it for the last step + 
if etime + dt > sim_time:
+                dt = sim_time - etime
+
+            # Perform a single time step
+            direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
+            # Inform the user
+            if mainproc:
-                print( "Elapsed Time: %lf / %lf\n", etime , sim_time )
+                print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}\n")
+            # Update the elapsed time and output counter
+            etime = etime + dt
+            output_counter = output_counter + dt
+            # If it's time for output, reset the counter, and do output
+            if output_freq >= 0 and output_counter >= output_freq:
+                output_counter = output_counter - output_freq
                 num_out = output(state, etime, num_out, fixed_data)
 
-    auto t2 = std::chrono::steady_clock::now();
+    # FIXME: run_simulation assigns dt, etime, and num_out from main's
+    # scope, so it needs "nonlocal dt, etime, num_out" at its top; without
+    # that, the assignments make them locals and raise UnboundLocalError.
+    # Also pass the callable itself to timeit -- writing run_simulation()
+    # here would run the whole simulation before the timer starts.
+    time_in_s = timeit.timeit(run_simulation, number=1)
     if mainproc:
-        print( "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n" )
+        print(f"CPU Time: {time_in_s} s\n")
 
     # Final reductions for mass, kinetic energy, and total energy
     (mass, te) = reductions(state, fixed_data)
 
     if mainproc:
-        print( "d_mass: %le\n" , (mass - mass0)/mass0 )
-        print( "d_te:   %le\n" , (te   - te0  )/te0   )
+        print( f"d_mass: {((mass - mass0)/mass0)}" )
+        print( f"d_te:   {((te   - te0  )/te0  )}" )
 
     finalize()

From d5634d23143e0b143212206efb13c623437d455e Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Tue, 4 Mar 2025 11:24:35 -0700
Subject: [PATCH 05/83] Progress on porting to Python (print statements)
---
 python/miniWeather.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index fcbf75f..ba3cc30 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -7,6 +7,7 @@
 # //
 # //////////////////////////////////////////////////////////////////////////////////////////
 
+import sys
 import timeit
 #include "const.h"
 #include "pnetcdf.h"
@@ -114,8 +115,8 @@ def run_simulation():
     (mass, te) = reductions(state, fixed_data)
 
     if mainproc:
-        print( f"d_mass: {((mass - mass0)/mass0)}" )
-        print( f"d_te:   {((te   - te0  )/te0  )}" )
+        print(f"d_mass: {((mass - mass0)/mass0)}")
+        print(f"d_te:   {((te   - te0  )/te0  )}")
 
     finalize()
@@ -513,9 +514,9 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
 
     # If I'm the main process in MPI, display some grid information
     if mainproc:
-        print( "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob)
-        print( "dx,dz: %lf %lf\n",dx,dz)
-        print( "dt: %lf\n",dt)
+        print(f"nx_glob, nz_glob: {nx_glob} {nz_glob}\n")
+        print(f"dx,dz: {dx} {dz}\n")
+        print(f"dt: {dt}\n")
 
     # Want to make sure this info is displayed before further output
     # ierr = MPI_Barrier(MPI_COMM_WORLD);
@@ -872,9 +873,9 @@ def output(
 //Error reporting routine for the PNetCDF I/O
 void ncwrap( int ierr , int line ) {
   if (ierr != NC_NOERR) {
-    printf("NetCDF Error at line: %d\n", line);
-    printf("%s\n",ncmpi_strerror(ierr));
-    exit(-1);
+    print(f"NetCDF Error at line: {line}\n")
+    print(f"{ncmpi_strerror(ierr)}\n")
+    sys.exit(-1)
   }
 }

From 3fe55840f3f7f4b40a56c27320d9acba744ea0cc Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Tue, 4 Mar 2025 14:51:55 -0700
Subject: [PATCH 06/83] All C++ code is now Python code
---
 python/miniWeather.py | 297 +++++++++++++++++++-----------------------
 1 file changed, 132 insertions(+), 165 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index ba3cc30..21ee2c3 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -9,54 +9,129 @@
 
 import sys
 import timeit
-#include "const.h"
+import numpy as np
 #include "pnetcdf.h"
 
+# "real" in the original C++ code could be either float or double.
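+# A note on that choice: the "real" annotations on the module-level
+# constants below (cp, cv, C0, dx, dz, ...) do not convert anything --
+# those names remain ordinary Python floats, i.e. double precision. Only
+# values stored into arrays created with dtype=real take on the chosen
+# precision, so switching to np.float32 changes array storage but not the
+# scalar setup arithmetic. Wrapping the constants, e.g. cp = real(1004.0),
+# would be one way to get single precision end to end.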
+real = np.float64 # or np.float32 + +# +# Constants (ported from cpp/const.h) +# + +hs: int = 2 + +# We don't like code that redefines pi, +# but we keep it just for now, to test that +# the Python gives the same results as the C++. +pi: real = 3.14159265358979323846264338327 +grav: real = 9.8 # Gravitational acceleration (m / s^2) +cp: real = 1004.0 # Specific heat of dry air at constant pressure +cv: real = 717.0 # Specific heat of dry air at constant volume +rd: real = 287.0 # Dry air constant for equation of state (P=rho*rd*T) +p0: real = 1.e5 # Standard pressure at the surface in Pascals +C0: real = 27.5629410929725921310572974482 # Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +# gamma=cp/Rd, have to call this gamm because "gamma" is taken (I hate C so much) +gamm: real = 1.40027894002789400278940027894 + +# +# Domain and stability-related constants +# + +xlen: real = 2.e4 # Length of the domain in the x-direction (meters) +zlen: real = 1.e4 # Length of the domain in the z-direction (meters) +hv_beta: real = 0.05 # How strong to diffuse the solution: hv_beta \in [0:1] +cfl: real = 1.50 # "Courant, Friedrichs, Lewy" number (for numerical stability) +max_speed: real = 450 # Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) # "Halo" size: number of cells beyond the MPI tasks's domain # needed for a full "stencil" of information for reconstruction hs: int = 2 +# Size of the stencil used for interpolation +sten_size: int = 4 -# real can be either float or double, depending on cpp/const.h. +# /////////////////////////////////////////////////////////////////////////////////////// +# // BEGIN USER-CONFIGURABLE PARAMETERS +# /////////////////////////////////////////////////////////////////////////////////////// +# The x-direction length is twice as long as the z-direction length +# So, you'll want to have nx_glob be twice as large as nz_glob +nx_glob: int = _NX # Number of total cells in the x-direction +nz_glob: int = _NZ # Number of total cells in the z-direction +sim_time: real = _SIM_TIME # How many seconds to run the simulation +output_freq: real = _OUT_FREQ # How frequently to output data to file (in seconds) +data_spec_int: int = _DATA_SPEC # How to initialize the data +# /////////////////////////////////////////////////////////////////////////////////////// +# // END USER-CONFIGURABLE PARAMETERS +# /////////////////////////////////////////////////////////////////////////////////////// +dx: real = xlen / nx_glob +dz: real = zlen / nz_glob + + +# +# Parameters for indexing and flags +# + +NUM_VARS: int = 4 # Number of fluid state variables +ID_DENS: int = 0 #index for density ("rho") +ID_UMOM: int = 1 #index for momentum in the x-direction ("rho * u") +ID_WMOM: int = 2 #index for momentum in the z-direction ("rho * w") +ID_RHOT: int = 3 #index for density * potential temperature ("rho * theta") +DIR_X: int = 1 #Integer constant to express that this operation is in the x-direction +DIR_Z: int = 2 #Integer constant to express that this operation is in the z-direction +DATA_SPEC_COLLISION: int = 1 +DATA_SPEC_THERMAL: int = 2 +DATA_SPEC_GRAVITY_WAVES: int = 3 +DATA_SPEC_DENSITY_CURRENT: int = 5 +DATA_SPEC_INJECTION: int = 6 + +# +# These functions aid in porting from the original C++. 
+#
 
 #typedef yakl::Array real1d;
 def real1d(name: str, nx: int):
-    # FIXME (mfh 2025/03/03) This should NOT return the same thing as doub1d; need to change type
-    return np.zeros(nx) # FIXME (mfh 2025/03/03)
+    return np.zeros(nx, dtype=real)
 
 #typedef yakl::Array real2d;
 # Callers pass extents slowest-varying first, i.e. (nz, nx), so the
 # parameters are declared in that order, matching the returned shape.
 def real2d(name: str, nz: int, nx: int):
-    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+    return np.zeros((nz, nx), dtype=real)
 
 #typedef yakl::Array real3d;
 # Callers pass (nvars, nz, nx), matching the returned shape.
 def real3d(name: str, nvars: int, nz: int, nx: int):
-    return np.zeros((nvars, nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+    return np.zeros((nvars, nz, nx), dtype=real)
 
 #typedef yakl::Array doub2d;
 def doub2d(name: str, nz: int, nx: int):
-    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+    return np.zeros((nz, nx), dtype=real)
+
+#typedef yakl::Array realConst1d;
+# init() below converts freshly filled arrays with realConst1d(arr), so
+# take the array itself and return a read-only view, standing in for
+# yakl's const array types.
+def realConst1d(arr):
+    v = arr.view()
+    v.flags.writeable = False
+    return v
+
+#typedef yakl::Array realConst2d;
+def realConst2d(arr):
+    return realConst1d(arr) # the read-only-view trick is rank-agnostic
+
+# ///////////////////////////////////////////////////////////////////////////////////////
+# // Variables that are initialized but remain static over the course of the simulation
+# ///////////////////////////////////////////////////////////////////////////////////////
+
+class Fixed_data:
+    def __init__(self):
+        self.nx = 0          # Number of local grid cells in the x dimension for this MPI task
+        self.nz = 0          # Number of local grid cells in the z dimension for this MPI task
+        self.i_beg = 0       # beginning index in the x direction for this MPI task
+        self.k_beg = 0       # beginning index in the z direction for this MPI task
+        self.nranks = 0      # Number of MPI ranks
+        self.myrank = 0      # My rank id
+        self.left_rank = 0   # MPI Rank ID that exists to my left in the global domain
+        self.right_rank = 0  # MPI Rank ID that exists to my right in the global domain
+        self.mainproc = True # Am I the main process (rank == 0)?
+        self.hy_dens_cell = None # realConst1d: hydrostatic density (vert cell avgs).
Dimensions: (1-hs:nz+hs) + self.hy_dens_theta_cell = None # realConst1d: hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + self.hy_dens_int = None # realConst1d: hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + self.hy_dens_theta_int = None # realConst1d: hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + self.hy_pressure_int = None # realConst1d: hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + # /////////////////////////////////////////////////////////////////////////////////////// # // THE MAIN PROGRAM STARTS HERE @@ -68,7 +143,7 @@ def main() -> None: # fixed_data: Fixed_data # state: real3d # dt: real: Model time step (seconds) - (fixed_data, state, dt) = init() # init( state , dt , fixed_data ); + (fixed_data, state, dt) = init() mainproc = fixed_data.mainproc @@ -468,19 +543,7 @@ def set_halo_values_z( # state, dt, and fixed_data used to be output parameters. # It would be more Pythonic to return them as a tuple. def init(): # -> tuple[real3d, real, Fixed_data]: - #real3d state - #Fixed_data fixed_data - #real dt - - auto &nx = fixed_data.nx ; - auto &nz = fixed_data.nz ; - auto &i_beg = fixed_data.i_beg ; - auto &k_beg = fixed_data.k_beg ; - auto &left_rank = fixed_data.left_rank ; - auto &right_rank = fixed_data.right_rank ; - auto &nranks = fixed_data.nranks ; - auto &myrank = fixed_data.myrank ; - auto &mainproc = fixed_data.mainproc ; + ierr: int = 0 # ///////////////////////////////////////////////////////////// @@ -501,7 +564,8 @@ def init(): # -> tuple[real3d, real, Fixed_data]: # END MPI DUMMY SECTION # ////////////////////////////////////////////// - # Vertical direction isn't MPI-ized, so the rank's local values = the global values + # Vertical direction isn't MPI-ized, + # so the rank's local values = the global values k_beg = 0 nz = nz_glob mainproc = (myrank == 0) @@ -522,10 +586,7 @@ def init(): # -> tuple[real3d, real, Fixed_data]: # ierr = MPI_Barrier(MPI_COMM_WORLD); # Define quadrature weights and points - nqpoints: int = 3 - # SArray qpoints; - # SArray qweights; - + nqpoints: int = 3 qpoints = np.array([ 0.112701665379258311482073460022, 0.500000000000000000000000000000, @@ -555,11 +616,6 @@ def init(): # -> tuple[real3d, real, Fixed_data]: # Compute the x,z location within the global domain based on cell and quadrature index x: real = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx z: real = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz - # real r, u, w, t, hr, ht; - - # The above real variables are probably output parameters - # of collision, thermal, gravity_waves, density_current, and injection. - # x and z are probably input parameters of these functions. 
# Set the fluid state based on the user's specification if data_spec_int == DATA_SPEC_COLLISION: @@ -574,16 +630,16 @@ def init(): # -> tuple[real3d, real, Fixed_data]: (r,u,w,t,hr,ht) = injection(x,z) # Store into the fluid state array - state[ID_DENS,k,i] += r * qweights[ii]*qweights[kk]; - state[ID_UMOM,k,i] += (r+hr)*u * qweights[ii]*qweights[kk]; - state[ID_WMOM,k,i] += (r+hr)*w * qweights[ii]*qweights[kk]; - state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]; + state[ID_DENS,k,i] += r * qweights[ii]*qweights[kk] + state[ID_UMOM,k,i] += (r+hr)*u * qweights[ii]*qweights[kk] + state[ID_WMOM,k,i] += (r+hr)*w * qweights[ii]*qweights[kk] + state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk] - hy_dens_cell = real1d("hy_dens_cell ", nz+2*hs); - hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs); - hy_dens_int = real1d("hy_dens_int ", nz+1); - hy_dens_theta_int = real1d("hy_dens_theta_int ", nz+1); - hy_pressure_int = real1d("hy_pressure_int ", nz+1); + hy_dens_cell = real1d("hy_dens_cell ", nz+2*hs) + hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs) + hy_dens_int = real1d("hy_dens_int ", nz+1) + hy_dens_theta_int = real1d("hy_dens_theta_int ", nz+1) + hy_pressure_int = real1d("hy_pressure_int ", nz+1) # Compute the hydrostatic background state over vertical cell averages # ///////////////////////////////////////////////// @@ -630,15 +686,20 @@ def init(): # -> tuple[real3d, real, Fixed_data]: hy_dens_int[k] = hr hy_dens_theta_int[k] = hr*ht - hy_pressure_int[k] = C0*pow((hr*ht),gamm) + hy_pressure_int[k] = C0*pow((hr*ht), gamm) + + hy_dens_cell = realConst1d(hy_dens_cell ) + hy_dens_theta_cell = realConst1d(hy_dens_theta_cell) + hy_dens_int = realConst1d(hy_dens_int ) + hy_dens_theta_int = realConst1d(hy_dens_theta_int ) + hy_pressure_int = realConst1d(hy_pressure_int ) - fixed_data.hy_dens_cell = realConst1d(hy_dens_cell ) - fixed_data.hy_dens_theta_cell = realConst1d(hy_dens_theta_cell) - fixed_data.hy_dens_int = realConst1d(hy_dens_int ) - fixed_data.hy_dens_theta_int = realConst1d(hy_dens_theta_int ) - fixed_data.hy_pressure_int = realConst1d(hy_pressure_int ) + fixed_data = Fixed_data(nx, nz, i_beg, k_beg, + nranks, myrank, left_rank, right_rank, mainproc, + hy_dens_cell, hy_dens_theta_cell, hy_dens_int, + hy_dens_theta_int, hy_pressure_int) - # FIXME (mfh 2025/03/03) This should return something; not sure what yet + return (state, fixed_data, dt) # This test case is initially balanced but injects fast, cold air from the left boundary near the model top # x and z are input coordinates at which to sample @@ -775,94 +836,10 @@ def output( fixed_data # Fixed_data const& ) -> int: # num_out (updated) - nx = fixed_data.nx - nz = fixed_data.nz - i_beg = fixed_data.i_beg - k_beg = fixed_data.k_beg - mainproc = fixed_data.mainproc - hy_dens_cell = fixed_data.hy_dens_cell - hy_dens_theta_cell = fixed_data.hy_dens_theta_cell - - int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3]; - MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; - # Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) - # Inform the user if mainproc: print("*** OUTPUT ***\n") - # Allocate some (big) temp arrays - dens = doub2d("dens", nz, nx) - uwnd = doub2d("uwnd", nz, nx) - wwnd = doub2d("wwnd", nz, nx) - theta = doub2d("theta", nz, nx) - - # If the elapsed time is zero, create the file. 
Otherwise, open the file - if etime == 0: - # Create the file - ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ ); - # Create the dimensions - ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ ) - ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob , &x_dimid ) , __LINE__ ) - ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob , &z_dimid ) , __LINE__ ) - # Create the variables - dimids[0] = t_dimid - ncwrap( ncmpi_def_var( ncid , "t" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ) - dimids[0] = t_dimid - dimids[1] = z_dimid - dimids[2] = x_dimid - ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ) - ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ) - ncwrap( ncmpi_def_var( ncid , "wwnd" , NC_DOUBLE , 3 , dimids , &wwnd_varid ) , __LINE__ ) - ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ ) - # End "define" mode - ncwrap( ncmpi_enddef( ncid ) , __LINE__ ); - else: - # Open the file - ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ ) - # Get the variable IDs - ncwrap( ncmpi_inq_varid( ncid , "dens" , &dens_varid ) , __LINE__ ) - ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ) - ncwrap( ncmpi_inq_varid( ncid , "wwnd" , &wwnd_varid ) , __LINE__ ) - ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ ) - ncwrap( ncmpi_inq_varid( ncid , "t" , &t_varid ) , __LINE__ ) - - # Store perturbed values in the temp arrays for output - # ///////////////////////////////////////////////// - # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR - # ///////////////////////////////////////////////// - for k in range(nz): - for i in range(nx): - dens[k,i] = state[ID_DENS,hs+k,hs+i] - uwnd[k,i] = state[ID_UMOM,hs+k,hs+i] / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] ) - wwnd[k,i] = state[ID_WMOM,hs+k,hs+i] / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] ) - theta[k,i] = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] ) - hy_dens_theta_cell[hs+k] / hy_dens_cell[hs+k] - - # Write the grid data to file with all the processes writing collectively - st3[0] = num_out - st3[1] = k_beg - st3[2] = i_beg - ct3[0] = 1 - ct3[1] = nz - ct3[2] = nx - ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens.data() ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd.data() ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , wwnd_varid , st3 , ct3 , wwnd.data() ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.data() ) , __LINE__ ); - - //Only the main process needs to write the elapsed time - //Begin "independent" write mode - ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ ); - # write elapsed time to file - if mainproc: - st1[0] = num_out - ct1[0] = 1 - double etimearr[1]; - etimearr[0] = etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ ); - - //End "independent" write mode - ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ ); - //Close the file - ncwrap( ncmpi_close(ncid) , __LINE__ ); + # TODO (mfh 2025/03/04) Actually write to the output file. 
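# ---------------------------------------------------------------------------
# Aside: one possible way to fill in this TODO without PNetCDF, sketched with
# the serial netCDF4 package (an assumption; the port has not picked an I/O
# library yet). write_output and its arguments are hypothetical names that
# mirror the temp arrays of the deleted code above; no MPI decomposition.
from netCDF4 import Dataset

def write_output(dens, uwnd, wwnd, theta, etime, num_out):
    nc = Dataset("output.nc", "w" if etime == 0 else "a")
    if etime == 0:
        nc.createDimension("t", None)            # unlimited, like NC_UNLIMITED
        nc.createDimension("z", dens.shape[0])
        nc.createDimension("x", dens.shape[1])
        nc.createVariable("t", "f8", ("t",))
        for name in ("dens", "uwnd", "wwnd", "theta"):
            nc.createVariable(name, "f8", ("t", "z", "x"))
    nc["t"][num_out] = etime
    nc["dens"][num_out, :, :] = dens
    nc["uwnd"][num_out, :, :] = uwnd
    nc["wwnd"][num_out, :, :] = wwnd
    nc["theta"][num_out, :, :] = theta
    nc.close()
    return num_out + 1
# ---------------------------------------------------------------------------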
# Increment the number of outputs num_out = num_out + 1; @@ -870,16 +847,6 @@ def output( return num_out -//Error reporting routine for the PNetCDF I/O -void ncwrap( int ierr , int line ) { - if (ierr != NC_NOERR) { - print(f"NetCDF Error at line: {line}\n") - print(f"{ncmpi_strerror(ierr)}\n") - sys.exit(-1) - } -} - - def finalize() -> None: pass From 2967153482b9210df835e1ddeb3870b1c9886ef7 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 4 Mar 2025 16:22:26 -0700 Subject: [PATCH 07/83] Run-time debugging of port --- python/miniWeather.py | 131 +++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 21ee2c3..14710c9 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -7,23 +7,41 @@ # // # ////////////////////////////////////////////////////////////////////////////////////////// +import math +import numpy as np import sys import timeit -import numpy as np #include "pnetcdf.h" # "real" in the original C++ code could be either float or double. real = np.float64 # or np.float32 +double = np.float64 # -# Constants (ported from cpp/const.h) +# Parameters for indexing and flags +# (effectively enums, but we leave them as constants +# so it's easier to see the relationship to the original C++) # -hs: int = 2 +NUM_VARS: int = 4 # Number of fluid state variables +ID_DENS: int = 0 #index for density ("rho") +ID_UMOM: int = 1 #index for momentum in the x-direction ("rho * u") +ID_WMOM: int = 2 #index for momentum in the z-direction ("rho * w") +ID_RHOT: int = 3 #index for density * potential temperature ("rho * theta") +DIR_X: int = 1 #Integer constant to express that this operation is in the x-direction +DIR_Z: int = 2 #Integer constant to express that this operation is in the z-direction +DATA_SPEC_COLLISION: int = 1 +DATA_SPEC_THERMAL: int = 2 +DATA_SPEC_GRAVITY_WAVES: int = 3 +DATA_SPEC_DENSITY_CURRENT: int = 5 +DATA_SPEC_INJECTION: int = 6 + +# +# Constants (ported from cpp/const.h) +# -# We don't like code that redefines pi, -# but we keep it just for now, to test that -# the Python gives the same results as the C++. +# We don't like code that redefines pi, but we keep it for now, +# to test that the Python gives the same results as the C++. 
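# ---------------------------------------------------------------------------
# Aside: in float64 the hand-written constant below is harmless, because the
# decimal literal rounds to exactly the same double as the standard library's
# value, so the C++ and Python runs see bit-identical pi:
import math
assert 3.14159265358979323846264338327 == math.pi
# ---------------------------------------------------------------------------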
pi: real = 3.14159265358979323846264338327 grav: real = 9.8 # Gravitational acceleration (m / s^2) cp: real = 1004.0 # Specific heat of dry air at constant pressure @@ -54,61 +72,41 @@ # /////////////////////////////////////////////////////////////////////////////////////// # The x-direction length is twice as long as the z-direction length # So, you'll want to have nx_glob be twice as large as nz_glob -nx_glob: int = _NX # Number of total cells in the x-direction -nz_glob: int = _NZ # Number of total cells in the z-direction -sim_time: real = _SIM_TIME # How many seconds to run the simulation -output_freq: real = _OUT_FREQ # How frequently to output data to file (in seconds) -data_spec_int: int = _DATA_SPEC # How to initialize the data +nz_glob: int = 50 # Number of total cells in the z-direction +nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction +sim_time: real = 1000.0 # How many seconds to run the simulation +output_freq: real = 10.0 # How frequently to output data to file (in seconds) +data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// # // END USER-CONFIGURABLE PARAMETERS # /////////////////////////////////////////////////////////////////////////////////////// dx: real = xlen / nx_glob dz: real = zlen / nz_glob - -# -# Parameters for indexing and flags -# - -NUM_VARS: int = 4 # Number of fluid state variables -ID_DENS: int = 0 #index for density ("rho") -ID_UMOM: int = 1 #index for momentum in the x-direction ("rho * u") -ID_WMOM: int = 2 #index for momentum in the z-direction ("rho * w") -ID_RHOT: int = 3 #index for density * potential temperature ("rho * theta") -DIR_X: int = 1 #Integer constant to express that this operation is in the x-direction -DIR_Z: int = 2 #Integer constant to express that this operation is in the z-direction -DATA_SPEC_COLLISION: int = 1 -DATA_SPEC_THERMAL: int = 2 -DATA_SPEC_GRAVITY_WAVES: int = 3 -DATA_SPEC_DENSITY_CURRENT: int = 5 -DATA_SPEC_INJECTION: int = 6 - # # These functions aid in porting from the original C++. +# Not sure why the degrees of freedom are in reverse order; +# perhaps the original author wanted to preserve Fortran order. 
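# ---------------------------------------------------------------------------
# Aside: whatever order the helper parameters below take, the arrays come back
# with shape (nvars, nz, nx) in NumPy's default row-major (C) order, so the
# innermost i index is contiguous in memory -- the same layout as the C/C++
# versions' flattened indexing, not Fortran order:
import numpy as np
a = np.zeros((4, 3, 2), dtype=np.float64)   # (nvars, nz, nx)
assert a.flags["C_CONTIGUOUS"]
assert a.strides == (48, 16, 8)             # last axis has the smallest stride
# ---------------------------------------------------------------------------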
# #typedef yakl::Array real1d; -def real1d(name: string, nx: int): +def real1d(name: str, nx: int): return np.zeros(nx, dtype=real) -#typedef yakl::Array real2d; -def real2d(name: string, nx: int, nz: int): - return np.zeros((nz, nx), dtype=real) - #typedef yakl::Array real3d; -def real3d(name: string, nx: int, nz: int, nvars: int): +def real3d(name: str, nx: int, nz: int, nvars: int): return np.zeros((nvars, nz, nx), dtype=real) #typedef yakl::Array doub2d; -def doub2d(name: string, nx: int, nz: int): +def doub2d(name: str, nx: int, nz: int): return np.zeros((nz, nx), dtype=real) #typedef yakl::Array realConst1d; -def realConst1d(name: string, nx: int): +def realConst1d(name: str, nx: int): return np.zeros(nx, dtype=real) #typedef yakl::Array realConst2d; -def realConst2d(name: string, nx: int, nz: int): +def realConst2d(name: str, nx: int, nz: int): return np.zeros((nz, nx), dtype=real) # /////////////////////////////////////////////////////////////////////////////////////// @@ -216,7 +214,7 @@ def perform_timestep( nx = fixed_data.nx nz = fixed_data.nz - state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs) + state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS) if direction_switch != 0: # x-direction first @@ -264,7 +262,7 @@ def semi_discrete_step( k_beg = fixed_data.k_beg hy_dens_cell = fixed_data.hy_dens_cell - tend = real3d("tend", NUM_VARS, nz, nx) + tend = real3d("tend", nx=nx, nz=nz, nvars=NUM_VARS) if dir == DIR_X: # Set the halo values for this MPI task's fluid state in the x-direction @@ -299,7 +297,7 @@ def semi_discrete_step( wpert: real = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0) tend[ID_WMOM,k,i] += wpert*hy_dens_cell[hs+k] - state_out(ll,hs+k,hs+i) = state_init(ll,hs+k,hs+i) + dt * tend(ll,k,i); + state_out[ll,hs+k,hs+i] = state_init[ll,hs+k,hs+i] + dt * tend[ll,k,i] # yakl::timer_stop("apply tendencies"); @@ -323,7 +321,7 @@ def compute_tendencies_x( hy_dens_cell = fixed_data.hy_dens_cell hy_dens_theta_cell = fixed_data.hy_dens_theta_cell - flux = real3d("flux", NUM_VARS, nz, nx+1) + flux = real3d("flux", nx=nx+1, nz=nz, nvars=NUM_VARS) # Compute the hyperviscosity coefficient hv_coef: real = -hv_beta * dx / (16*dt) @@ -396,7 +394,7 @@ def compute_tendencies_z( hy_dens_theta_int = fixed_data.hy_dens_theta_int hy_pressure_int = fixed_data.hy_pressure_int - flux = real3d("flux", NUM_VARS, nz+1, nx) + flux = real3d("flux", nx=nx, nz=nz+1, nvars=NUM_VARS) # Compute the hyperviscosity coefficient hv_coef: real = -hv_beta * dz / (16*dt); @@ -411,9 +409,9 @@ def compute_tendencies_z( #SArray d3_vals; #SArray vals; - stencil = np.zeros((1, 4), dtype=real) - d3_vals = np.zeros((1, NUM_VARS), dtype=real) - vals = np.zeros((1, NUM_VARS), dtype=real) + stencil = np.zeros(4, dtype=real) + d3_vals = np.zeros(NUM_VARS, dtype=real) + vals = np.zeros(NUM_VARS, dtype=real) # Use fourth-order interpolation from four cell averages to compute the value at the interface in question for ll in range(NUM_VARS): @@ -440,8 +438,6 @@ def compute_tendencies_z( flux[ID_UMOM,k,i] = r*w*u - hv_coef*d3_vals[ID_UMOM]; flux[ID_WMOM,k,i] = r*w*w+p - hv_coef*d3_vals[ID_WMOM]; flux[ID_RHOT,k,i] = r*w*t - hv_coef*d3_vals[ID_RHOT]; - } - } # Use the fluxes to compute tendencies for each cell #///////////////////////////////////////////////// @@ -571,7 +567,9 @@ def init(): # -> tuple[real3d, real, Fixed_data]: mainproc = (myrank == 0) # Allocate the model data - state = real3d("state", NUM_VARS, nz+2*hs, nx+2*hs) + state = real3d("state", nx=nx+2*hs, nz=nz+2*hs, 
nvars=NUM_VARS) + if mainproc: + print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}") # Define the maximum stable time step based on an assumed maximum wind speed dt: real = min(dx,dz) / max_speed * cfl; @@ -608,6 +606,8 @@ def init(): # -> tuple[real3d, real, Fixed_data]: for i in range(nx+2*hs): # Initialize the state to zero for ll in range(NUM_VARS): + if mainproc: + print(f"ll={ll}, k={k}, i={i}") state[ll,k,i] = 0.0 # Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation @@ -688,11 +688,11 @@ def init(): # -> tuple[real3d, real, Fixed_data]: hy_dens_theta_int[k] = hr*ht hy_pressure_int[k] = C0*pow((hr*ht), gamm) - hy_dens_cell = realConst1d(hy_dens_cell ) - hy_dens_theta_cell = realConst1d(hy_dens_theta_cell) - hy_dens_int = realConst1d(hy_dens_int ) - hy_dens_theta_int = realConst1d(hy_dens_theta_int ) - hy_pressure_int = realConst1d(hy_pressure_int ) + hy_dens_cell = realConst1d("hy_dens_cell ", nz+2*hs) + hy_dens_theta_cell = realConst1d("hy_dens_theta_cell", nz+2*hs) + hy_dens_int = realConst1d("hy_dens_int ", nz+1) + hy_dens_theta_int = realConst1d("hy_dens_theta_int ", nz+1) + hy_pressure_int = realConst1d("hy_pressure_int ", nz+1) fixed_data = Fixed_data(nx, nz, i_beg, k_beg, nranks, myrank, left_rank, right_rank, mainproc, @@ -794,7 +794,7 @@ def hydro_const_theta(z: real): # returns (r, t) r = rt / t # Density at z return (r, t) -} + # Establish hydrostatic balance using constant Brunt-Vaisala frequency @@ -818,10 +818,10 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t) # amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse def sample_ellipse_cosine(x: real, z: real, amp: real, x0: real, z0: real, xrad: real, zrad: real) -> real: # Compute distance from bubble center - dist: real = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0 + dist: real = math.sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0 # If the distance from bubble center is less than the radius, create a cos**2 profile if dist <= pi / 2.0: - return amp * pow(cos(dist),2.) + return amp * math.pow(math.cos(dist), 2.0) else: return 0.0 @@ -831,9 +831,9 @@ def sample_ellipse_cosine(x: real, z: real, amp: real, x0: real, z0: real, xrad: # If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics def output( state, # realConst3d - etime, # real - num_out, # int - fixed_data # Fixed_data const& + etime: real, + num_out: int, + fixed_data: Fixed_data ) -> int: # num_out (updated) if mainproc: @@ -841,10 +841,7 @@ def output( # TODO (mfh 2025/03/04) Actually write to the output file. 
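# ---------------------------------------------------------------------------
# Aside: a quick properties check for sample_ellipse_cosine as defined above
# (example values only, shaped like the thermal test case): the profile peaks
# at amp in the ellipse center and vanishes outside the ellipse.
amp, x0, z0, xrad, zrad = 3.0, 10000.0, 2000.0, 2000.0, 2000.0
assert sample_ellipse_cosine(x0, z0, amp, x0, z0, xrad, zrad) == amp
assert sample_ellipse_cosine(x0 + 2*xrad, z0, amp, x0, z0, xrad, zrad) == 0.0
# ---------------------------------------------------------------------------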
- # Increment the number of outputs - num_out = num_out + 1; - - return num_out + return num_out + 1 def finalize() -> None: @@ -854,7 +851,7 @@ def finalize() -> None: # Compute reduced quantities for error checking without resorting to the "ncdiff" tool #void reductions( realConst3d state , double &mass , double &te , Fixed_data const &fixed_data ) { def reductions( - state # realConst3d, an input parameter + state, # realConst3d, an input parameter fixed_data # Fixed_data const&, an input parameter ) -> tuple[double, double]: # mass, te @@ -886,3 +883,7 @@ def reductions( #te = glob[1]; return (mass, te) + + +if __name__ == "__main__": + main() From 3d9794e3c6b0bed627e68f17e0b00c692cfb55e6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 4 Mar 2025 16:47:36 -0700 Subject: [PATCH 08/83] More run-time debugging --- python/miniWeather.py | 68 +++++++++++++++++++++++++++---------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 14710c9..8c1ebde 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -76,7 +76,7 @@ nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction sim_time: real = 1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) -data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data +data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// # // END USER-CONFIGURABLE PARAMETERS # /////////////////////////////////////////////////////////////////////////////////////// @@ -114,21 +114,37 @@ def realConst2d(name: str, nx: int, nz: int): # /////////////////////////////////////////////////////////////////////////////////////// class Fixed_data: - def __init__(self): - self.nx = 0 # Number of local grid cells in the x dimension for this MPI task - self.nz = 0 # Number of local grid cells in the z dimension for this MPI task - self.i_beg = 0 # beginning index in the x direction for this MPI task - self.k_beg = 0 # beginning index in the z direction for this MPI task - self.nranks = 0 # Number of MPI ranks - self.myrank = 0 # My rank id - self.left_rank = 0 # MPI Rank ID that exists to my left in the global domain - self.right_rank = 0 # MPI Rank ID that exists to my right in the global domain - self.mainproc = True # Am I the main process (rank == 0)? - self.hy_dens_cell = None # realConst1d: hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) - self.hy_dens_theta_cell = None # realConst1d: hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) - self.hy_dens_int = None # realConst1d: hydrostatic density (vert cell interf). Dimensions: (1:nz+1) - self.hy_dens_theta_int = None # realConst1d: hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) - self.hy_pressure_int = None # realConst1d: hydrostatic press (vert cell interf). 
Dimensions: (1:nz+1) + def __init__(self, + nx: int = 0, # Number of local grid cells in the x dimension for this MPI task + nz: int = 0, # Number of local grid cells in the z dimension for this MPI task + i_beg: int = 0, # beginning index in the x direction for this MPI task + k_beg: int = 0, # beginning index in the z direction for this MPI task + nranks: int = 0, # Number of MPI ranks + myrank: int = 0, # My rank id + left_rank: int = 0, # MPI Rank ID that exists to my left in the global domain + right_rank: int = 0, # MPI Rank ID that exists to my right in the global domain + mainproc: bool = True, # Am I the main process (rank == 0)? + hy_dens_cell = None, # realConst1d: hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + hy_dens_theta_cell = None, # realConst1d: hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + hy_dens_int = None, # realConst1d: hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + hy_dens_theta_int = None, # realConst1d: hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + hy_pressure_int = None, # realConst1d:hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + ): + + self.nx = nx + self.nz = nz + self.i_beg = i_beg + self.k_beg = k_beg + self.nranks = nranks + self.myrank = myrank + self.left_rank = left_rank + self.right_rank = right_rank + self.mainproc = mainproc + self.hy_dens_cell = hy_dens_cell + self.hy_dens_theta_cell = hy_dens_theta_cell + self.hy_dens_int = hy_dens_int + self.hy_dens_theta_int = hy_dens_theta_int + self.hy_pressure_int = hy_pressure_int # /////////////////////////////////////////////////////////////////////////////////////// @@ -141,7 +157,7 @@ def main() -> None: # fixed_data: Fixed_data # state: real3d # dt: real: Model time step (seconds) - (fixed_data, state, dt) = init() + (state, fixed_data, dt) = init() mainproc = fixed_data.mainproc @@ -158,7 +174,7 @@ def main() -> None: # //////////////////////////////////////////////////// # MAIN TIME STEP LOOP # //////////////////////////////////////////////////// - def run_simulation(): + def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: direction_switch: int = 1 # Order in which dimensional splitting takes x,z solves output_counter: real = 0.0 # Helps determine when it's time to do output @@ -179,8 +195,10 @@ def run_simulation(): if output_freq >= 0 and output_counter >= output_freq: output_counter = output_counter - output_freq num_out = output(state, etime, num_out, fixed_data) + # NOTE (mfh 2025/03/04) etime and num_out will be discarded. + # Figure out how to return them from within a timeit expression. 
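# ---------------------------------------------------------------------------
# Aside on the timeit call below: timeit.timeit() wants a string or a
# callable, but main_time_step_loop(dt, etime, num_out) *calls* the loop
# first (untimed) and hands its return value to timeit, which rejects
# anything that is neither a string nor a callable. A lambda defers the call:
#
#     time_in_s = timeit.timeit(lambda: main_time_step_loop(dt, etime, num_out),
#                               number=1)
#
# though the loop's results are still discarded; the default_timer pattern
# adopted in a later patch both times the loop and keeps them:
#
#     start = timeit.default_timer()
#     (etime, num_out) = main_time_step_loop(dt, etime, num_out)
#     time_in_s = timeit.default_timer() - start
# ---------------------------------------------------------------------------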
- time_in_s = timeit.timeit(run_simulation(), number=1) + time_in_s = timeit.timeit(main_time_step_loop(dt, etime, num_out), number=1) if mainproc: print(f"CPU Time: {time_in_s} s\n") @@ -337,9 +355,9 @@ def compute_tendencies_x( #SArray d3_vals; #SArray vals; - stencil = np.zeros((1, 4), dtype=real) - d3_vals = np.zeros((1, NUM_VARS), dtype=real) - vals = np.zeros((1, NUM_VARS), dtype=real) + stencil = np.zeros(4, dtype=real) + d3_vals = np.zeros(NUM_VARS, dtype=real) + vals = np.zeros(NUM_VARS, dtype=real) # Use fourth-order interpolation from four cell averages to compute the value at the interface in question for ll in range(NUM_VARS): @@ -606,8 +624,8 @@ def init(): # -> tuple[real3d, real, Fixed_data]: for i in range(nx+2*hs): # Initialize the state to zero for ll in range(NUM_VARS): - if mainproc: - print(f"ll={ll}, k={k}, i={i}") + #if mainproc: + # print(f"ll={ll}, k={k}, i={i}") state[ll,k,i] = 0.0 # Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation @@ -836,7 +854,7 @@ def output( fixed_data: Fixed_data ) -> int: # num_out (updated) - if mainproc: + if fixed_data.mainproc: print("*** OUTPUT ***\n") # TODO (mfh 2025/03/04) Actually write to the output file. From 6762e76a1f4ed36354993c5e3725666fde314ea9 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 4 Mar 2025 16:50:30 -0700 Subject: [PATCH 09/83] More run-time debugging --- python/miniWeather.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 8c1ebde..fe3d203 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -706,12 +706,6 @@ def init(): # -> tuple[real3d, real, Fixed_data]: hy_dens_theta_int[k] = hr*ht hy_pressure_int[k] = C0*pow((hr*ht), gamm) - hy_dens_cell = realConst1d("hy_dens_cell ", nz+2*hs) - hy_dens_theta_cell = realConst1d("hy_dens_theta_cell", nz+2*hs) - hy_dens_int = realConst1d("hy_dens_int ", nz+1) - hy_dens_theta_int = realConst1d("hy_dens_theta_int ", nz+1) - hy_pressure_int = realConst1d("hy_pressure_int ", nz+1) - fixed_data = Fixed_data(nx, nz, i_beg, k_beg, nranks, myrank, left_rank, right_rank, mainproc, hy_dens_cell, hy_dens_theta_cell, hy_dens_int, From 880ee6adc5f8f3e19e3bc8d1964caf91caf14a3c Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 4 Mar 2025 17:07:53 -0700 Subject: [PATCH 10/83] It runs to completion without errors --- python/miniWeather.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index fe3d203..b4f3222 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -74,7 +74,7 @@ # So, you'll want to have nx_glob be twice as large as nz_glob nz_glob: int = 50 # Number of total cells in the z-direction nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction -sim_time: real = 1000.0 # How many seconds to run the simulation +sim_time: real = 20.0 #1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// @@ -174,6 +174,7 @@ def main() -> None: # //////////////////////////////////////////////////// # MAIN TIME STEP LOOP # //////////////////////////////////////////////////// + def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: direction_switch: int = 1 # Order in which dimensional splitting takes x,z solves output_counter: real = 
0.0 # Helps determine when it's time to do output @@ -195,10 +196,14 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: if output_freq >= 0 and output_counter >= output_freq: output_counter = output_counter - output_freq num_out = output(state, etime, num_out, fixed_data) - # NOTE (mfh 2025/03/04) etime and num_out will be discarded. - # Figure out how to return them from within a timeit expression. - time_in_s = timeit.timeit(main_time_step_loop(dt, etime, num_out), number=1) + return (etime, num_out) + + start_time = timeit.default_timer() + (etime, num_out) = main_time_step_loop(dt, etime, num_out) + end_time = timeit.default_timer() + + time_in_s = end_time - start_time if mainproc: print(f"CPU Time: {time_in_s} s\n") From 01e45bcba091148e1c4d1d47c012b616f96472ba Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 00:36:37 +0200 Subject: [PATCH 11/83] Debugging Python (C is OK) --- c/miniWeather_serial.cpp | 16 +++++++++++++++- cpp/CMakeLists.txt | 2 +- cpp/YAKL | 2 +- python/miniWeather.py | 17 +++++++++++------ 4 files changed, 28 insertions(+), 9 deletions(-) mode change 160000 => 120000 cpp/YAKL diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index 0d04d44..12f14ce 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -132,6 +132,10 @@ int main(int argc, char **argv) { //Initial reductions for mass, kinetic energy, and total energy reductions(mass0,te0); + { + printf( "mass0: %le\n" , mass0 ); + printf( "te0: %le\n" , te0 ); + } //Output the initial state output(state,etime); @@ -157,6 +161,13 @@ int main(int argc, char **argv) { output_counter = output_counter - output_freq; output(state,etime); } + { + double mass = 0.0; + double te = 0.0; + reductions(mass, te); + printf( "mass: %le\n" , mass ); + printf( "te: %le\n" , te ); + } } auto t2 = std::chrono::steady_clock::now(); if (mainproc) { @@ -722,6 +733,7 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //The file I/O uses parallel-netcdf, the only external library required for this mini-app. 
//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics void output( double *state , double etime ) { +#if 0 int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3]; int i, k, ind_r, ind_u, ind_w, ind_t; MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; @@ -802,16 +814,18 @@ void output( double *state , double etime ) { //Close the file ncwrap( ncmpi_close(ncid) , __LINE__ ); - +#endif // 0 //Increment the number of outputs num_out = num_out + 1; +#if 0 //Deallocate the temp arrays free( dens ); free( uwnd ); free( wwnd ); free( theta ); free( etimearr ); +#endif // 0 } diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dde59f..8b9ea27 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -116,7 +116,7 @@ target_link_libraries(parallelfor_simd_x_test "${LDFLAGS}") add_test(NAME YAKL_SIMD_X_TEST COMMAND ./check_output.sh ./parallelfor_simd_x_test 1e-9 4.5e-5 ) -include(YAKL/yakl_utils.cmake) +include(YAKL/deprecated/yakl_utils.cmake) yakl_process_target(serial) yakl_process_target(serial_test) yakl_process_target(mpi) diff --git a/cpp/YAKL b/cpp/YAKL deleted file mode 160000 index 71a059c..0000000 --- a/cpp/YAKL +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 71a059c4701d22f3d60157b01a922776261993c0 diff --git a/cpp/YAKL b/cpp/YAKL new file mode 120000 index 0000000..faed9af --- /dev/null +++ b/cpp/YAKL @@ -0,0 +1 @@ +../../YAKL \ No newline at end of file diff --git a/python/miniWeather.py b/python/miniWeather.py index b4f3222..bd0f438 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -163,6 +163,9 @@ def main() -> None: # Initial reductions for mass, kinetic energy, and total energy (mass0, te0) = reductions(state, fixed_data) + if mainproc: + print(f"mass0: {mass0:.6e}") + print(f"te0: {te0:.6e}") num_out: int = 0 # The number of outputs performed so far etime: real = 0.0 # Elapsed time @@ -188,7 +191,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: direction_switch = perform_timestep(state, dt, direction_switch, fixed_data) # Inform the user if mainproc: - print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}\n") + print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}") # Update the elapsed time and output counter etime = etime + dt output_counter = output_counter + dt @@ -197,6 +200,11 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: output_counter = output_counter - output_freq num_out = output(state, etime, num_out, fixed_data) + (mass, te) = reductions(state, fixed_data) + if mainproc: + print(f"mass: {mass:.6e}") + print(f"te: {te:.6e}") + return (etime, num_out) start_time = timeit.default_timer() @@ -211,8 +219,8 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: (mass, te) = reductions(state, fixed_data) if mainproc: - print(f"d_mass: {((mass - mass0)/mass0)}") - print(f"d_te: {((te - te0 )/te0 )}") + print(f"d_mass: {((mass - mass0)/mass0):.6e}") + print(f"d_te: {((te - te0 )/te0 ):.6e}") finalize() @@ -853,9 +861,6 @@ def output( fixed_data: Fixed_data ) -> int: # num_out (updated) - if fixed_data.mainproc: - print("*** OUTPUT ***\n") - # TODO (mfh 2025/03/04) Actually write to the output file. 
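# ---------------------------------------------------------------------------
# Aside: while chasing the mass balance it can help to cross-check the loop
# version of reductions() with a vectorized one. A sketch, assuming the
# module-level constants and a Fixed_data instance fd; it applies exactly the
# per-cell formulas of the loop version (note ke = r*(u*u+w*w) with no 1/2
# factor, matching the C code). The {:.6e} formats above match C's %le, so
# the two logs stay directly diff-able.
import numpy as np

def reductions_vectorized(state, fd):
    r  = state[ID_DENS, hs:hs+fd.nz, hs:hs+fd.nx] + fd.hy_dens_cell[hs:hs+fd.nz, None]
    u  = state[ID_UMOM, hs:hs+fd.nz, hs:hs+fd.nx] / r
    w  = state[ID_WMOM, hs:hs+fd.nz, hs:hs+fd.nx] / r
    th = (state[ID_RHOT, hs:hs+fd.nz, hs:hs+fd.nx]
          + fd.hy_dens_theta_cell[hs:hs+fd.nz, None]) / r
    p  = C0 * (r * th) ** gamm
    t  = th / (p0 / p) ** (rd / cp)
    mass = float(np.sum(r) * dx * dz)
    te   = float(np.sum(r * cv * t + r * (u * u + w * w)) * dx * dz)
    return (mass, te)
# ---------------------------------------------------------------------------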
return num_out + 1 From 2d4f752e0d4e0975a04624efb980459ee8c0958d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 00:43:09 +0200 Subject: [PATCH 12/83] Remove some Python / C diffs Remove some differences (state_tmp allocation and initialization) between Python and C. It didn't change the Python output at all. --- python/miniWeather.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index bd0f438..e01a2c7 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -156,8 +156,9 @@ def main() -> None: # fixed_data: Fixed_data # state: real3d + # state_tmp: real3d # dt: real: Model time step (seconds) - (state, fixed_data, dt) = init() + (state, state_tmp, fixed_data, dt) = init() mainproc = fixed_data.mainproc @@ -188,7 +189,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: dt = sim_time - etime # Perform a single time step - direction_switch = perform_timestep(state, dt, direction_switch, fixed_data) + direction_switch = perform_timestep(state, state_tmp, dt, direction_switch, fixed_data) # Inform the user if mainproc: print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}") @@ -237,6 +238,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: # q_n+1 = q_n + dt/1 * rhs(q**) def perform_timestep( state, # real3d const&, input parameter + state_tmp, dt, # real, must be an input parameter direction_switch, # int&, in/out parameter, transformed to input and return value fixed_data # Fixed_data const &, input parameter @@ -245,8 +247,6 @@ def perform_timestep( nx = fixed_data.nx nz = fixed_data.nz - state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS) - if direction_switch != 0: # x-direction first semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, fixed_data) @@ -569,7 +569,7 @@ def set_halo_values_z( # state, dt, and fixed_data used to be output parameters. # It would be more Pythonic to return them as a tuple. 
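# ---------------------------------------------------------------------------
# Aside: stripped of the dimensional splitting, each directional solve above
# is the simple low-storage three-stage scheme from the comments:
#   q*     = q[n] + dt/3 * rhs(q[n])
#   q**    = q[n] + dt/2 * rhs(q*)
#   q[n+1] = q[n] + dt/1 * rhs(q**)
# A minimal scalar sketch (rk3_step is a hypothetical name, not in the port):
def rk3_step(q, rhs, dt):
    q1 = q + (dt / 3) * rhs(q)
    q2 = q + (dt / 2) * rhs(q1)
    return q + dt * rhs(q2)

# e.g. dq/dt = -q for one step of dt = 0.1 from q = 1.0:
print(rk3_step(1.0, lambda q: -q, 0.1))   # ~0.904833, vs. exp(-0.1) ~0.904837
# ---------------------------------------------------------------------------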
-def init(): # -> tuple[real3d, real, Fixed_data]: +def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_data) ierr: int = 0 @@ -601,6 +601,7 @@ def init(): # -> tuple[real3d, real, Fixed_data]: state = real3d("state", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS) if mainproc: print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}") + state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS) # Define the maximum stable time step based on an assumed maximum wind speed dt: real = min(dx,dz) / max_speed * cfl; @@ -666,6 +667,9 @@ def init(): # -> tuple[real3d, real, Fixed_data]: state[ID_WMOM,k,i] += (r+hr)*w * qweights[ii]*qweights[kk] state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk] + for ll in range(NUM_VARS): + state_tmp[ll,k,i] = state[ll,k,i] + hy_dens_cell = real1d("hy_dens_cell ", nz+2*hs) hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs) hy_dens_int = real1d("hy_dens_int ", nz+1) @@ -724,7 +728,7 @@ def init(): # -> tuple[real3d, real, Fixed_data]: hy_dens_cell, hy_dens_theta_cell, hy_dens_int, hy_dens_theta_int, hy_pressure_int) - return (state, fixed_data, dt) + return (state, state_tmp, fixed_data, dt) # This test case is initially balanced but injects fast, cold air from the left boundary near the model top # x and z are input coordinates at which to sample From f9b94f6f22c24c4b5c2a597a8508138aaf251d04 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:01:47 +0200 Subject: [PATCH 13/83] Make Python flux like C flux Extents of flux in Python differed from those in C. I changed the Python flux allocation to work like C, but that didn't help the mass balance. --- python/miniWeather.py | 55 +++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index e01a2c7..ef046d4 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -155,10 +155,11 @@ def main() -> None: #yakl::init(); # fixed_data: Fixed_data - # state: real3d - # state_tmp: real3d + # state: real3d, NUM_VARS x (nz+2*hs) x (nz+2*hs) + # state_tmp: real3d, ditto + # flux: real3d, NUM_VARS x (nz+1) x (nx+1) # dt: real: Model time step (seconds) - (state, state_tmp, fixed_data, dt) = init() + (state, state_tmp, flux, fixed_data, dt) = init() mainproc = fixed_data.mainproc @@ -189,7 +190,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: dt = sim_time - etime # Perform a single time step - direction_switch = perform_timestep(state, state_tmp, dt, direction_switch, fixed_data) + direction_switch = perform_timestep(state, state_tmp, flux, dt, direction_switch, fixed_data) # Inform the user if mainproc: print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}") @@ -238,7 +239,8 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: # q_n+1 = q_n + dt/1 * rhs(q**) def perform_timestep( state, # real3d const&, input parameter - state_tmp, + state_tmp, # real3d + flux, # real3d dt, # real, must be an input parameter direction_switch, # int&, in/out parameter, transformed to input and return value fixed_data # Fixed_data const &, input parameter @@ -249,22 +251,22 @@ def perform_timestep( if direction_switch != 0: # x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, fixed_data) + 
semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, fixed_data) # z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, fixed_data) else: # z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, fixed_data) # x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, fixed_data) if direction_switch: direction_switch = 0 @@ -284,6 +286,7 @@ def semi_discrete_step( state_out, # real3d const&, dt: real, dir: int, + flux, # real3d fixed_data # Fixed_data const& ) -> None: @@ -302,7 +305,7 @@ def semi_discrete_step( #yakl::timer_stop("halo x"); # Compute the time tendencies for the fluid state in the x-direction #yakl::timer_start("tendencies x"); - compute_tendencies_x(state_forcing, tend, dt, fixed_data) + compute_tendencies_x(state_forcing, flux, tend, dt, fixed_data) #yakl::timer_stop("tendencies x"); elif dir == DIR_Z: # Set the halo values for this MPI task's fluid state in the z-direction @@ -311,7 +314,7 @@ def semi_discrete_step( #yakl::timer_stop("halo z"); # Compute the time tendencies for the fluid state in the z-direction #yakl::timer_start("tendencies z"); - compute_tendencies_z(state_forcing, tend, dt, fixed_data) + compute_tendencies_z(state_forcing, flux, tend, dt, fixed_data) #yakl::timer_stop("tendencies z"); # ///////////////////////////////////////////////// @@ -323,6 +326,9 @@ def semi_discrete_step( for k in range(nz): for i in range(nx): if data_spec_int == DATA_SPEC_GRAVITY_WAVES: + print("*** NOT IMPLEMENTED ***"); + sys.exit(-1); + x: real = (i_beg + i+0.5)*dx; z: real = (k_beg + k+0.5)*dz; wpert: real = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0) @@ -342,6 +348,7 @@ def semi_discrete_step( # Then, compute the tendencies using those fluxes def compute_tendencies_x( state, # realConst3d + flux, tend, # real3d const& dt, # real fixed_data # Fixed_data const& @@ -352,8 +359,6 @@ def compute_tendencies_x( hy_dens_cell = fixed_data.hy_dens_cell hy_dens_theta_cell = fixed_data.hy_dens_theta_cell - flux = real3d("flux", nx=nx+1, nz=nz, nvars=NUM_VARS) - # Compute the hyperviscosity coefficient hv_coef: real = -hv_beta * dx / (16*dt) # 
///////////////////////////////////////////////// @@ -414,6 +419,7 @@ def compute_tendencies_x( # Then, compute the tendencies using those fluxes def compute_tendencies_z( state, # realConst3d + flux, tend, # real3d const& dt, # real fixed_data # Fixed_data const& @@ -425,8 +431,6 @@ def compute_tendencies_z( hy_dens_theta_int = fixed_data.hy_dens_theta_int hy_pressure_int = fixed_data.hy_pressure_int - flux = real3d("flux", nx=nx, nz=nz+1, nvars=NUM_VARS) - # Compute the hyperviscosity coefficient hv_coef: real = -hv_beta * dz / (16*dt); # ///////////////////////////////////////////////// @@ -569,7 +573,7 @@ def set_halo_values_z( # state, dt, and fixed_data used to be output parameters. # It would be more Pythonic to return them as a tuple. -def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_data) +def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixed_data: Fixed_data) ierr: int = 0 @@ -602,6 +606,7 @@ def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_ if mainproc: print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}") state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS) + flux = real3d("flux", nx=nx+1, nz=nz+1, nvars=NUM_VARS) # Define the maximum stable time step based on an assumed maximum wind speed dt: real = min(dx,dz) / max_speed * cfl; @@ -728,7 +733,7 @@ def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_ hy_dens_cell, hy_dens_theta_cell, hy_dens_int, hy_dens_theta_int, hy_pressure_int) - return (state, state_tmp, fixed_data, dt) + return (state, state_tmp, flux, fixed_data, dt) # This test case is initially balanced but injects fast, cold air from the left boundary near the model top # x and z are input coordinates at which to sample From 52e035b5f0cf078be9a9d97814c502219dd5d580 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:09:00 +0200 Subject: [PATCH 14/83] Make direction_switch the same; verify --- c/miniWeather_serial.cpp | 1 + python/miniWeather.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index 12f14ce..512bf55 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -194,6 +194,7 @@ int main(int argc, char **argv) { // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { + printf("direction_switch: %d\n", direction_switch); if (direction_switch) { //x-direction first semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); diff --git a/python/miniWeather.py b/python/miniWeather.py index ef046d4..a1013f2 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -249,7 +249,8 @@ def perform_timestep( nx = fixed_data.nx nz = fixed_data.nz - if direction_switch != 0: + print(f'direction_switch: {direction_switch}') + if direction_switch: # x-direction first semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, fixed_data) semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data) From 650634205c359459f7e5d851cc9cd00e56ef37a0 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:16:04 +0200 Subject: [PATCH 15/83] Reconcile more, but no change in results --- python/miniWeather.py | 55 ++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/python/miniWeather.py 
b/python/miniWeather.py index a1013f2..81a947f 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -159,7 +159,7 @@ def main() -> None: # state_tmp: real3d, ditto # flux: real3d, NUM_VARS x (nz+1) x (nx+1) # dt: real: Model time step (seconds) - (state, state_tmp, flux, fixed_data, dt) = init() + (state, state_tmp, flux, tend, fixed_data, dt) = init() mainproc = fixed_data.mainproc @@ -190,7 +190,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: dt = sim_time - etime # Perform a single time step - direction_switch = perform_timestep(state, state_tmp, flux, dt, direction_switch, fixed_data) + direction_switch = perform_timestep(state, state_tmp, flux, tend, dt, direction_switch, fixed_data) # Inform the user if mainproc: print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}") @@ -241,6 +241,7 @@ def perform_timestep( state, # real3d const&, input parameter state_tmp, # real3d flux, # real3d + tend, # real3d dt, # real, must be an input parameter direction_switch, # int&, in/out parameter, transformed to input and return value fixed_data # Fixed_data const &, input parameter @@ -252,22 +253,22 @@ def perform_timestep( print(f'direction_switch: {direction_switch}') if direction_switch: # x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, tend, fixed_data) # z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, tend, fixed_data) else: # z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, tend, fixed_data) # x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, fixed_data) - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data) - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, fixed_data) + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend, fixed_data) + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, tend, fixed_data) if direction_switch: direction_switch = 0 @@ -288,6 +289,7 @@ def semi_discrete_step( dt: real, dir: int, flux, # real3d + tend, # real3d fixed_data # Fixed_data const& ) -> None: @@ -297,8 +299,6 @@ def 
semi_discrete_step( k_beg = fixed_data.k_beg hy_dens_cell = fixed_data.hy_dens_cell - tend = real3d("tend", nx=nx, nz=nz, nvars=NUM_VARS) - if dir == DIR_X: # Set the halo values for this MPI task's fluid state in the x-direction #yakl::timer_start("halo x"); @@ -393,7 +393,7 @@ def compute_tendencies_x( u = vals[ID_UMOM] / r w = vals[ID_WMOM] / r t = (vals[ID_RHOT] + hy_dens_theta_cell[hs+k]) / r - p = C0*pow((r*t), gamm) + p = C0*math.pow((r*t), gamm) # Compute the flux vector flux[ID_DENS,k,i] = r*u - hv_coef*d3_vals[ID_DENS] @@ -464,7 +464,7 @@ def compute_tendencies_z( u: real = vals[ID_UMOM] / r; w: real = vals[ID_WMOM] / r; t: real = ( vals[ID_RHOT] + hy_dens_theta_int[k] ) / r; - p: real = C0*pow((r*t),gamm) - hy_pressure_int[k]; + p: real = C0*math.pow((r*t),gamm) - hy_pressure_int[k]; if k == 0 or k == nz: w = 0; d3_vals[ID_DENS] = 0; @@ -574,7 +574,7 @@ def set_halo_values_z( # state, dt, and fixed_data used to be output parameters. # It would be more Pythonic to return them as a tuple. -def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixed_data: Fixed_data) +def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, tend: real3d, dt: real, fixed_data: Fixed_data) ierr: int = 0 @@ -608,6 +608,7 @@ def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixe print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}") state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS) flux = real3d("flux", nx=nx+1, nz=nz+1, nvars=NUM_VARS) + tend = real3d("tend", nx=nx, nz=nz, nvars=NUM_VARS) # Define the maximum stable time step based on an assumed maximum wind speed dt: real = min(dx,dz) / max_speed * cfl; @@ -727,14 +728,14 @@ def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixe hy_dens_int[k] = hr hy_dens_theta_int[k] = hr*ht - hy_pressure_int[k] = C0*pow((hr*ht), gamm) + hy_pressure_int[k] = C0*math.pow((hr*ht), gamm) fixed_data = Fixed_data(nx, nz, i_beg, k_beg, nranks, myrank, left_rank, right_rank, mainproc, hy_dens_cell, hy_dens_theta_cell, hy_dens_int, hy_dens_theta_int, hy_pressure_int) - return (state, state_tmp, flux, fixed_data, dt) + return (state, state_tmp, flux, tend, fixed_data, dt) # This test case is initially balanced but injects fast, cold air from the left boundary near the model top # x and z are input coordinates at which to sample @@ -824,8 +825,8 @@ def hydro_const_theta(z: real): # returns (r, t) # Establish hydrostatic balance first using Exner pressure t = theta0 # Potential Temperature at z exner = exner0 - grav * z / (cp * theta0) # Exner pressure at z - p = p0 * pow(exner, (cp/rd)) # Pressure at z - rt = pow((p / C0), (1. / gamm)) # rho*theta at z + p = p0 * math.pow(exner, (cp/rd)) # Pressure at z + rt = math.pow((p / C0), (1. / gamm)) # rho*theta at z r = rt / t # Density at z return (r, t) @@ -841,8 +842,8 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t) exner0: real = 1.0 # Surface-level Exner pressure t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ) # Pot temp at z exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0) # Exner pressure at z - p = p0 * pow(exner,(cp/rd)) # Pressure at z - rt = pow((p / C0),(1. / gamm)) # rho*theta at z + p = p0 * math.pow(exner,(cp/rd)) # Pressure at z + rt = math.pow((p / C0),(1. 
/ gamm)) # rho*theta at z r = rt / t # Density at z return (r, t) @@ -900,8 +901,8 @@ def reductions( u = state[ID_UMOM,hs+k,hs+i] / r # U-wind w = state[ID_WMOM,hs+k,hs+i] / r # W-wind th = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / r # Potential Temperature (theta) - p = C0*pow(r*th,gamm) # Pressure - t = th / pow(p0/p,rd/cp) # Temperature + p = C0*math.pow(r*th,gamm) # Pressure + t = th / math.pow(p0/p,rd/cp) # Temperature ke = r*(u*u+w*w) # Kinetic Energy ie = r*cv*t # Internal Energy mass += r *dx*dz # Accumulate domain mass From 53dbd0eaefab1b28d66c15e089433437c97ee9ea Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:18:52 +0200 Subject: [PATCH 16/83] Reconcile more, but no change in results --- python/miniWeather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 81a947f..98ed4bb 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -840,7 +840,7 @@ def hydro_const_theta(z: real): # returns (r, t) def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t) theta0: real = 300.0 # Background potential temperature exner0: real = 1.0 # Surface-level Exner pressure - t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ) # Pot temp at z + t = theta0 * math.exp( bv_freq0*bv_freq0 / grav * z ) # Pot temp at z exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0) # Exner pressure at z p = p0 * math.pow(exner,(cp/rd)) # Pressure at z rt = math.pow((p / C0),(1. / gamm)) # rho*theta at z From 4c6fd2056ca1fd3c26fa6f578967d43d4ae175af Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:24:19 +0200 Subject: [PATCH 17/83] Reconcile collision; didn't help --- python/miniWeather.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 98ed4bb..f3210ad 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -808,10 +808,8 @@ def collision(x: real, z: real): # returns (r, u, w, t, hr, ht) t = 0.0 u = 0.0 w = 0.0 - # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter. - # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++). 
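# ---------------------------------------------------------------------------
# Aside: the hydro_const_* profiles touched in the last two patches can be
# spot-checked for hydrostatic balance. With p = C0*(rho*theta)**gamm, the
# returned profile should satisfy dp/dz = -rho*grav up to finite-difference
# error (pressure_at is a hypothetical helper; constants are the module's):
import math

def pressure_at(z):
    (r, t) = hydro_const_theta(z)
    return C0 * math.pow(r * t, gamm)

z, h = 1000.0, 1.0
(r, _) = hydro_const_theta(z)
dpdz = (pressure_at(z + h) - pressure_at(z - h)) / (2 * h)
print(dpdz, -r * grav)   # the two numbers should agree closely
# ---------------------------------------------------------------------------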
- t = sample_ellipse_cosine(x, z, 20.0, xlen/2, 2000.0, 2000.0, 2000.0) + \ - sample_ellipse_cosine(x, z, -20.0, xlen/2, 8000.0, 2000.0, 2000.0) + t = t + sample_ellipse_cosine(x, z, 20.0, xlen/2, 2000.0, 2000.0, 2000.0) + t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2, 8000.0, 2000.0, 2000.0) return (r, u, w, t, hr, ht) From 8dd07fb77235ce349807ceae81b4f6f58d30a86b Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:29:26 +0200 Subject: [PATCH 18/83] More reconciling: abs -> math.fabs in set_halo_values_x --- python/miniWeather.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index f3210ad..e901365 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -530,7 +530,7 @@ def set_halo_values_x( for k in range(nz): for i in range(hs): z: real = (k_beg + k+0.5)*dz - if abs(z-3*zlen/4) <= zlen/16: + if math.fabs(z-3*zlen/4) <= zlen/16: state[ID_UMOM,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 50.0 state[ID_RHOT,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 298.0 - hy_dens_theta_cell[hs+k] @@ -761,9 +761,7 @@ def density_current(x: real, z: real): # returns (r, u, w, t, hr, ht) t = 0.0 u = 0.0 w = 0.0 - # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter. - # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++). - t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0) + t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0) return (r, u, w, t, hr, ht) @@ -791,9 +789,7 @@ def thermal(x: real, z: real): # returns (r, u, w, t, hr, ht) t = 0.0 u = 0.0 w = 0.0 - # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter. - # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++). 
- t = sample_ellipse_cosine(x, z, 3.0, xlen/2, 2000.0, 2000.0, 2000.0) + t = t + sample_ellipse_cosine(x, z, 3.0, xlen/2, 2000.0, 2000.0, 2000.0) return (r, u, w, t, hr, ht) From 5ae15b13f7b9077229c882bbec46da6ce0f83f05 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 01:47:11 +0200 Subject: [PATCH 19/83] Remove superfluous comments --- python/miniWeather.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index e901365..f3760af 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -301,33 +301,24 @@ def semi_discrete_step( if dir == DIR_X: # Set the halo values for this MPI task's fluid state in the x-direction - #yakl::timer_start("halo x"); set_halo_values_x(state_forcing, fixed_data) - #yakl::timer_stop("halo x"); # Compute the time tendencies for the fluid state in the x-direction - #yakl::timer_start("tendencies x"); compute_tendencies_x(state_forcing, flux, tend, dt, fixed_data) - #yakl::timer_stop("tendencies x"); elif dir == DIR_Z: # Set the halo values for this MPI task's fluid state in the z-direction - #yakl::timer_start("halo z"); set_halo_values_z(state_forcing, fixed_data) - #yakl::timer_stop("halo z"); # Compute the time tendencies for the fluid state in the z-direction - #yakl::timer_start("tendencies z"); compute_tendencies_z(state_forcing, flux, tend, dt, fixed_data) - #yakl::timer_stop("tendencies z"); # ///////////////////////////////////////////////// # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR # ///////////////////////////////////////////////// # Apply the tendencies to the fluid state - # yakl::timer_start("apply tendencies"); for ll in range(NUM_VARS): for k in range(nz): for i in range(nx): if data_spec_int == DATA_SPEC_GRAVITY_WAVES: - print("*** NOT IMPLEMENTED ***"); + print("*** TEMPORARILY DISABLED ***"); sys.exit(-1); x: real = (i_beg + i+0.5)*dx; @@ -337,8 +328,6 @@ def semi_discrete_step( state_out[ll,hs+k,hs+i] = state_init[ll,hs+k,hs+i] + dt * tend[ll,k,i] - # yakl::timer_stop("apply tendencies"); - # NOTE It's OK for this not to return anything, # as long as we can treat state_out as an output parameter. From bc5c387c53e76981d3c140b5d2bfe8187652a888 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 7 Mar 2025 19:09:46 +0200 Subject: [PATCH 20/83] Change Python to run THERMAL not INJECTION --- python/miniWeather.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index f3760af..61048ca 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -76,7 +76,8 @@ nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction sim_time: real = 20.0 #1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) -data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data +#data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data +data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// # // END USER-CONFIGURABLE PARAMETERS # /////////////////////////////////////////////////////////////////////////////////////// From 81ae53d9b814c758a8717fa5645e6c8f35c15a7a Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 19:12:25 +0200 Subject: [PATCH 21/83] Fix MPI_Info_dup error in C output PNetCDF wants an MPI_Info that is not MPI_INFO_NULL. There are other issues with C output. 
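Side note: if the Python port ever grows MPI-parallel I/O, the same MPI_Info
requirement would apply there. A minimal mpi4py sketch (an assumption; the
serial port does not use mpi4py at all yet):

    from mpi4py import MPI

    info = MPI.Info.Create()    # analogue of MPI_Info_create
    # ... pass `info`, never MPI.INFO_NULL, to the parallel I/O layer ...
    info.Free()                 # analogue of MPI_Info_free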
--- c/miniWeather_serial.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index 512bf55..f2638c9 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -734,7 +734,7 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //The file I/O uses parallel-netcdf, the only external library required for this mini-app. //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics void output( double *state , double etime ) { -#if 0 +#if 1 int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3]; int i, k, ind_r, ind_u, ind_w, ind_t; MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; @@ -750,10 +750,19 @@ void output( double *state , double etime ) { theta = (double *) malloc(nx*nz*sizeof(double)); etimearr = (double *) malloc(1 *sizeof(double)); + // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. + // It's possible that earlier PNetCDF versions tolerated MPI_INFO_NULL. + MPI_Info mpi_info; + auto info_err = MPI_Info_create(&mpi_info); + if (info_err != MPI_SUCCESS) { + printf("Error creating MPI Info object\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + //If the elapsed time is zero, create the file. Otherwise, open the file if (etime == 0) { //Create the file - ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ ); + ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , mpi_info , &ncid ) , __LINE__ ); //Create the dimensions ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ ); ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob , &x_dimid ) , __LINE__ ); @@ -770,7 +779,7 @@ void output( double *state , double etime ) { ncwrap( ncmpi_enddef( ncid ) , __LINE__ ); } else { //Open the file - ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ ); + ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , mpi_info , &ncid ) , __LINE__ ); //Get the variable IDs ncwrap( ncmpi_inq_varid( ncid , "dens" , &dens_varid ) , __LINE__ ); ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ); @@ -819,7 +828,9 @@ void output( double *state , double etime ) { //Increment the number of outputs num_out = num_out + 1; -#if 0 +#if 1 + MPI_Info_free(&mpi_info); + //Deallocate the temp arrays free( dens ); free( uwnd ); From da28c5581229d949e38bdfb2f7011cc6510f7c2c Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 20:52:54 +0200 Subject: [PATCH 22/83] C: output updates --- c/miniWeather_serial.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index f2638c9..173554b 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -742,7 +742,7 @@ void output( double *state , double etime ) { double *dens, *uwnd, *wwnd, *theta; double *etimearr; //Inform the user - if (mainproc) { printf("*** OUTPUT ***\n"); } + if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); } //Allocate some (big) temp arrays dens = (double *) malloc(nx*nz*sizeof(double)); uwnd = (double *) malloc(nx*nz*sizeof(double)); @@ -755,7 +755,7 @@ void output( double *state , double etime ) { MPI_Info mpi_info; auto info_err = MPI_Info_create(&mpi_info); if (info_err != MPI_SUCCESS) { - printf("Error creating MPI Info object\n"); + fprintf(stderr, 
"Error creating MPI Info object\n"); MPI_Abort(MPI_COMM_WORLD, -1); } @@ -769,7 +769,7 @@ void output( double *state , double etime ) { ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob , &z_dimid ) , __LINE__ ); //Create the variables dimids[0] = t_dimid; - ncwrap( ncmpi_def_var( ncid , "t" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ); + ncwrap( ncmpi_def_var( ncid , "t_var" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ); dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid; ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ); ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ); @@ -785,7 +785,7 @@ void output( double *state , double etime ) { ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ); ncwrap( ncmpi_inq_varid( ncid , "wwnd" , &wwnd_varid ) , __LINE__ ); ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "t" , &t_varid ) , __LINE__ ); + ncwrap( ncmpi_inq_varid( ncid , "t_var" , &t_varid ) , __LINE__ ); } //Store perturbed values in the temp arrays for output @@ -817,7 +817,8 @@ void output( double *state , double etime ) { if (mainproc) { st1[0] = num_out; ct1[0] = 1; - etimearr[0] = etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ ); + etimearr[0] = etime; + ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ ); } //End "independent" write mode ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ ); @@ -844,9 +845,9 @@ void output( double *state , double etime ) { //Error reporting routine for the PNetCDF I/O void ncwrap( int ierr , int line ) { if (ierr != NC_NOERR) { - printf("NetCDF Error at line: %d\n", line); - printf("%s\n",ncmpi_strerror(ierr)); - exit(-1); + fprintf(stderr, "NetCDF Error at line: %d\n", line); + fprintf(stderr, "%s\n", ncmpi_strerror(ierr)); + MPI_Abort(MPI_COMM_WORLD, -1); } } From a3cdf1f2acf6607139527b32feedff9e58ff282b Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 22:40:48 +0200 Subject: [PATCH 23/83] C: debug output to stderr not stdout --- c/miniWeather_serial.cpp | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index 173554b..a9ddf9c 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -133,8 +133,8 @@ int main(int argc, char **argv) { //Initial reductions for mass, kinetic energy, and total energy reductions(mass0,te0); { - printf( "mass0: %le\n" , mass0 ); - printf( "te0: %le\n" , te0 ); + fprintf(stderr, "mass0: %le\n" , mass0); + fprintf(stderr, "te0: %le\n" , te0 ); } //Output the initial state @@ -151,7 +151,7 @@ int main(int argc, char **argv) { perform_timestep(state,state_tmp,flux,tend,dt); //Inform the user #ifndef NO_INFORM - if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); } + if (mainproc) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime , sim_time ); } #endif //Update the elapsed time and output counter etime = etime + dt; @@ -161,25 +161,27 @@ int main(int argc, char **argv) { output_counter = output_counter - output_freq; output(state,etime); } +#if 0 { double mass = 0.0; double te = 0.0; reductions(mass, te); - printf( "mass: %le\n" , mass ); - printf( "te: %le\n" , te ); + fprintf(stderr, "mass: %le\n" , mass ); + fprintf(stderr, "te: %le\n" , te ); } +#endif // 0 } auto t2 = std::chrono::steady_clock::now(); if (mainproc) { - 
std::cout << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + std::cerr << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; } //Final reductions for mass, kinetic energy, and total energy reductions(mass,te); if (mainproc) { - printf( "d_mass: %le\n" , (mass - mass0)/mass0 ); - printf( "d_te: %le\n" , (te - te0 )/te0 ); + fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); + fprintf(stderr, "d_te: %le\n" , (te - te0 )/te0 ); } finalize(); @@ -194,7 +196,7 @@ int main(int argc, char **argv) { // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { - printf("direction_switch: %d\n", direction_switch); + //fprintf(stderr, "direction_switch: %d\n", direction_switch); if (direction_switch) { //x-direction first semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); @@ -535,9 +537,9 @@ void init( int *argc , char ***argv ) { //If I'm the main process in MPI, display some grid information if (mainproc) { - printf( "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob); - printf( "dx,dz: %lf %lf\n",dx,dz); - printf( "dt: %lf\n",dt); + fprintf(stderr, "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob); + fprintf(stderr, "dx,dz: %lf %lf\n",dx,dz); + fprintf(stderr, "dt: %lf\n",dt); } //Want to make sure this info is displayed before further output ierr = MPI_Barrier(MPI_COMM_WORLD); From 79212d0d2b4e5e212e214c7548493b16f961c587 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 22:41:02 +0200 Subject: [PATCH 24/83] C: Add build script that works for me --- c/build/cmake_kermit.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 c/build/cmake_kermit.sh diff --git a/c/build/cmake_kermit.sh b/c/build/cmake_kermit.sh new file mode 100644 index 0000000..5901101 --- /dev/null +++ b/c/build/cmake_kermit.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +PNETCDF_ROOT=/raid/mhoemmen/pkg/pnetcdf-1.14.0 +SRC_ROOT=/raid/mhoemmen/src/miniWeather/c +OPT_FLAGS="-g -O2" + +cmake \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_Fortran_COMPILER=mpif90 \ + -DCXXFLAGS="${OPT_FLAGS} -I${PNETCDF_ROOT}/include" \ + -DLDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" \ + -DNX=100 \ + -DNZ=50 \ + -DSIM_TIME=20 \ + -DOUT_FREQ=10 \ + ${SRC_ROOT} From 953601e2144614d9fea84ed928b1c9eef3eb777d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 15:35:33 -0600 Subject: [PATCH 25/83] Python: Starting on netcdf output --- python/miniWeather.py | 62 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 61048ca..2b986f1 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -856,7 +856,67 @@ def output( fixed_data: Fixed_data ) -> int: # num_out (updated) - # TODO (mfh 2025/03/04) Actually write to the output file. 
+ # Get dimensions from fixed_data + nx = fixed_data.nx + nz = fixed_data.nz + i_beg = fixed_data.i_beg + k_beg = fixed_data.k_beg + mainproc = fixed_data.mainproc + hy_dens_cell = fixed_data.hy_dens_cell + hy_dens_theta_cell = fixed_data.hy_dens_theta_cell + + # Inform the user + if mainproc: + print("*** OUTPUT ***") + + # Allocate temp arrays + # + # TODO (mfh 2025/03/10) Check output order + dens = np.zeros((nz,nx), dtype=real) + uwnd = np.zeros((nz,nx), dtype=real) + wwnd = np.zeros((nz,nx), dtype=real) + theta = np.zeros((nz,nx), dtype=real) + etimearr = np.zeros((1), dtype=real) + + # Store perturbed values in temp arrays + for k in range(nz): + for i in range(nx): + ind_r = ID_DENS*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs + ind_u = ID_UMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs + ind_w = ID_WMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs + ind_t = ID_RHOT*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs + dens[k,i] = state[ind_r] + uwnd[k,i] = state[ind_u] / (hy_dens_cell[k+hs] + state[ind_r]) + wwnd[k,i] = state[ind_w] / (hy_dens_cell[k+hs] + state[ind_r]) + theta[k,i] = (state[ind_t] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ind_r]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs] + + with (Dataset("output.nc", "w") if etime == 0 else Dataset("output.nc", "a")) as nc: + # Write output using netCDF4 + if etime == 0: + # Create dimensions + nc.createDimension("t", None) # unlimited + nc.createDimension("x", nx_glob) + nc.createDimension("z", nz_glob) + # Create variables + t_var = nc.createVariable("t_var", real, ("t",)) + dens_var = nc.createVariable("dens", real, ("t","z","x")) + uwnd_var = nc.createVariable("uwnd", real, ("t","z","x")) + wwnd_var = nc.createVariable("wwnd", real, ("t","z","x")) + theta_var = nc.createVariable("theta", real, ("t","z","x")) + else: + t_var = nc.variables["t_var"] + dens_var = nc.variables["dens"] + uwnd_var = nc.variables["uwnd"] + wwnd_var = nc.variables["wwnd"] + theta_var = nc.variables["theta"] + + # Write data + if mainproc: + t_var[num_out] = etime + dens_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = dens + uwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = uwnd + wwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = wwnd + theta_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = theta return num_out + 1 From 185fcafd975613e3cd087750fe9d5927a54fe7b3 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 15:40:07 -0600 Subject: [PATCH 26/83] Python: netcdf output 2 --- python/miniWeather.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 2b986f1..a193ac4 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -881,14 +881,10 @@ def output( # Store perturbed values in temp arrays for k in range(nz): for i in range(nx): - ind_r = ID_DENS*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs - ind_u = ID_UMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs - ind_w = ID_WMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs - ind_t = ID_RHOT*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs - dens[k,i] = state[ind_r] - uwnd[k,i] = state[ind_u] / (hy_dens_cell[k+hs] + state[ind_r]) - wwnd[k,i] = state[ind_w] / (hy_dens_cell[k+hs] + state[ind_r]) - theta[k,i] = (state[ind_t] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ind_r]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs] + dens[k,i] = state[ID_DENS, k+hs, i+hs] + uwnd[k,i] = state[ID_UMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) + wwnd[k,i] = state[ID_WMOM, 
k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) + theta[k,i] = (state[ID_RHOT, k+hs, i+hs] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs] with (Dataset("output.nc", "w") if etime == 0 else Dataset("output.nc", "a")) as nc: # Write output using netCDF4 From 5c2206adddbee88d467bcd67763aef33d29c015d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 15:42:37 -0600 Subject: [PATCH 27/83] Python: netcdf output 3 (may have fixed it) --- python/miniWeather.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index a193ac4..95b731a 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -8,10 +8,10 @@ # ////////////////////////////////////////////////////////////////////////////////////////// import math -import numpy as np import sys import timeit -#include "pnetcdf.h" +import numpy as np +from netCDF4 import Dataset # "real" in the original C++ code could be either float or double. real = np.float64 # or np.float32 From a1d2fc3877913a80bfaa7241d6a6ae2107700d65 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 16:34:09 -0600 Subject: [PATCH 28/83] Fix Python output --- python/miniWeather.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 95b731a..6aae6cd 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -74,10 +74,10 @@ # So, you'll want to have nx_glob be twice as large as nz_glob nz_glob: int = 50 # Number of total cells in the z-direction nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction -sim_time: real = 20.0 #1000.0 # How many seconds to run the simulation +sim_time: real = 700.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) #data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data -data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data +data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// # // END USER-CONFIGURABLE PARAMETERS # /////////////////////////////////////////////////////////////////////////////////////// @@ -201,7 +201,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None: # If it's time for output, reset the counter, and do output if output_freq >= 0 and output_counter >= output_freq: output_counter = output_counter - output_freq - num_out = output(state, etime, num_out, fixed_data) + num_out = output(state, etime, num_out, fixed_data) (mass, te) = reductions(state, fixed_data) if mainproc: From 07e0ccca14ff3cfae563f1a45246062b4b187fe5 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 10 Mar 2025 16:57:24 -0600 Subject: [PATCH 29/83] Update ncview home page --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4326e14..53dd642 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ Once the time tendency is computed, the fluid PDEs are essentially now cast as a * Parallel-netcdf: https://github.com/Parallel-NetCDF/PnetCDF * This is a dependency for two reasons: (1) NetCDF files are easy to visualize and convenient to work with; (2) The users of this code shouldn't have to write their own parallel I/O. 
-* Ncview: http://meteora.ucsd.edu/~pierce/ncview_home_page.html +* Ncview: https://cirrus.ucsd.edu/ncview/ * This is the easiest way to visualize NetCDF files. * MPI * For OpenACC: An OpenACC-capable compiler (PGI / Nvidia, Cray, GNU) From 6cb325ef1f8170afd09de7d7e04b6c1f5077abde Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 12 Mar 2025 17:22:01 +0200 Subject: [PATCH 30/83] C: temporarily fix sim params --- c/miniWeather_serial.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index a9ddf9c..2a484df 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -56,11 +56,13 @@ constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444 /////////////////////////////////////////////////////////////////////////////////////// //The x-direction length is twice as long as the z-direction length //So, you'll want to have nx_glob be twice as large as nz_glob -int constexpr nx_glob = _NX; //Number of total cells in the x-direction -int constexpr nz_glob = _NZ; //Number of total cells in the z-direction -double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation -double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) -int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data + +int constexpr nz_glob = 1000; //Number of total cells in the z-direction +int constexpr nx_glob = 2 * nz_glob; //Number of total cells in the x-direction +double constexpr sim_time = 700.0; //How many seconds to run the simulation +double constexpr output_freq = 10.0; //How frequently to output data to file (in seconds) +//int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data +int constexpr data_spec_int = DATA_SPEC_COLLISION; //How to initialize the data double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction /////////////////////////////////////////////////////////////////////////////////////// From fc81e1e19a3834aa9dac79fcf6d4329239957357 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 12 Mar 2025 18:27:53 +0200 Subject: [PATCH 31/83] C: output control (still need to test) --- c/miniWeather_serial.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index 2a484df..251b3c1 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -16,6 +16,8 @@ #include "pnetcdf.h" #include +#define MINIWEATHER_ONLY_OUTPUT_THETA 1 + constexpr double pi = 3.14159265358979323846264338327; //Pi constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) constexpr double cp = 1004.; //Specific heat of dry air at constant pressure @@ -748,9 +750,11 @@ void output( double *state , double etime ) { //Inform the user if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); } //Allocate some (big) temp arrays +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) dens = (double *) malloc(nx*nz*sizeof(double)); uwnd = (double *) malloc(nx*nz*sizeof(double)); wwnd = (double *) malloc(nx*nz*sizeof(double)); +#endif theta = (double *) malloc(nx*nz*sizeof(double)); etimearr = (double *) malloc(1 *sizeof(double)); @@ -775,9 +779,11 @@ void output( double *state , double etime ) { dimids[0] = t_dimid; ncwrap( ncmpi_def_var( ncid , "t_var" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ); dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid; +#if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ); ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ); ncwrap( ncmpi_def_var( ncid , "wwnd" , NC_DOUBLE , 3 , dimids , &wwnd_varid ) , __LINE__ ); +#endif ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ ); //End "define" mode ncwrap( ncmpi_enddef( ncid ) , __LINE__ ); @@ -785,9 +791,11 @@ void output( double *state , double etime ) { //Open the file ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , mpi_info , &ncid ) , __LINE__ ); //Get the variable IDs +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) ncwrap( ncmpi_inq_varid( ncid , "dens" , &dens_varid ) , __LINE__ ); ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ); ncwrap( ncmpi_inq_varid( ncid , "wwnd" , &wwnd_varid ) , __LINE__ ); +#endif ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ ); ncwrap( ncmpi_inq_varid( ncid , "t_var" , &t_varid ) , __LINE__ ); } @@ -796,12 +804,16 @@ void output( double *state , double etime ) { for (k=0; k Date: Wed, 12 Mar 2025 18:38:22 +0200 Subject: [PATCH 32/83] C: Remove hard-coding of parameters --- c/miniWeather_serial.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp index 251b3c1..710cb4f 100644 --- a/c/miniWeather_serial.cpp +++ b/c/miniWeather_serial.cpp @@ -59,12 +59,11 @@ constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444 //The x-direction length is twice as long as the z-direction length //So, you'll want to have nx_glob be twice as large as nz_glob -int constexpr nz_glob = 1000; //Number of total cells in the z-direction -int constexpr nx_glob = 2 * nz_glob; //Number of total cells in the x-direction -double constexpr sim_time = 700.0; //How many seconds to run the simulation -double constexpr output_freq = 10.0; //How frequently to output data to file (in seconds) -//int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data -int constexpr data_spec_int = DATA_SPEC_COLLISION; //How to initialize the data +int constexpr nz_glob = _NZ; //Number of total cells in the z-direction +int constexpr nx_glob = 2 * nz_glob; //Number of total cells in the x-direction +double constexpr sim_time = _SIM_TIME; //How many seconds to run the simulation +double constexpr output_freq = _OUT_FREQ; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = _DATA_SPEC; //How to initialize the data double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction /////////////////////////////////////////////////////////////////////////////////////// From 4303ec062a72f6de416bb15aa1f61316124f9c6f Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 12 Mar 2025 13:12:46 -0600 Subject: [PATCH 33/83] Python: x boundary injection hack For injection only, change the right boundary to be Dirichlet. Add a C(++) build script for Linux + gcc. 
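In C(++) terms the boundary change looks like the hedged sketch below; the actual edit in this patch is to the Python port, and idx is a hypothetical helper matching the flat (NUM_VARS, nz+2*hs, nx+2*hs) layout of c/miniWeather_serial.cpp:

    auto idx = [](int ll, int k, int i) {
      return ll*(nz+2*hs)*(nx+2*hs) + k*(nx+2*hs) + i;
    };
    for (int ll = 0; ll < NUM_VARS; ll++) {
      for (int k = 0; k < nz; k++) {
        if (data_spec_int == DATA_SPEC_INJECTION) {
          // Left halo stays periodic; right halo becomes Dirichlet (zero
          // perturbation) to suppress spurious reflections. Not physical,
          // and not energy-conserving (but injection is not conservative
          // anyway).
          state[idx(ll, hs+k, 0)] = state[idx(ll, hs+k, nx+hs-2)];
          state[idx(ll, hs+k, 1)] = state[idx(ll, hs+k, nx+hs-1)];
          state[idx(ll, hs+k, nx+hs  )] = 0.0;
          state[idx(ll, hs+k, nx+hs+1)] = 0.0;
        } else {
          // Fully periodic in x, as before.
          state[idx(ll, hs+k, 0)] = state[idx(ll, hs+k, nx+hs-2)];
          state[idx(ll, hs+k, 1)] = state[idx(ll, hs+k, nx+hs-1)];
          state[idx(ll, hs+k, nx+hs  )] = state[idx(ll, hs+k, hs  )];
          state[idx(ll, hs+k, nx+hs+1)] = state[idx(ll, hs+k, hs+1)];
        }
      }
    }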
--- c/build/cmake_linux_gnu.sh | 25 +++++++++++++++++++++++++ python/miniWeather.py | 22 +++++++++++++++------- 2 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 c/build/cmake_linux_gnu.sh diff --git a/c/build/cmake_linux_gnu.sh b/c/build/cmake_linux_gnu.sh new file mode 100644 index 0000000..719420d --- /dev/null +++ b/c/build/cmake_linux_gnu.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +SRC_ROOT=../../../src/miniWeather/c +OPT_FLAGS="-g -O2" + +#PNETCDF_LIB=/usr/lib/x86_64-linux-gnu +#PNETCDF_LDFLAGS="-L${PNETCDF_LIB} -lpnetcdf" +PNETCDF_LDFLAGS="-lpnetcdf" +#PNETCDF_CXXFLAGS="-I$/usr/include" +PNETCDF_CXXFLAGS="" + +DATA_SPEC="DATA_SPEC_INJECTION" + +cmake \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_Fortran_COMPILER=mpif90 \ + -DCXXFLAGS="${OPT_FLAGS} ${PNETCDF_CXXFLAGS}" \ + -DLDFLAGS="${PNETCDF_LDFLAGS}" \ + -DNX=200 \ + -DNZ=100 \ + -DSIM_TIME=1200 \ + -DOUT_FREQ=10 \ + -DDATA_SPEC="${DATA_SPEC}" \ + ${SRC_ROOT} diff --git a/python/miniWeather.py b/python/miniWeather.py index 6aae6cd..a59d0da 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -74,10 +74,10 @@ # So, you'll want to have nx_glob be twice as large as nz_glob nz_glob: int = 50 # Number of total cells in the z-direction nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction -sim_time: real = 700.0 # How many seconds to run the simulation +sim_time: real = 1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) -#data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data -data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data +data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data +#data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// # // END USER-CONFIGURABLE PARAMETERS # /////////////////////////////////////////////////////////////////////////////////////// @@ -507,10 +507,18 @@ def set_halo_values_x( # ////////////////////////////////////////////////////// for ll in range(NUM_VARS): for k in range(nz): - state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2]; - state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1]; - state[ll,hs+k,nx+hs ] = state[ll,hs+k,hs ]; - state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1 ]; + if data_spec_int == DATA_SPEC_INJECTION: + # Dirichlet boundary on the right ONLY avoids + # spurious reflections on the right and four corners. + # This should not be considered physical and does not + # conserve energy (but injection doesn't anyway). 
+ state[ll,hs+k,nx+hs ] = 0.0 + state[ll,hs+k,nx+hs+1] = 0.0 + else: + state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2] + state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1] + state[ll,hs+k,nx+hs ] = state[ll,hs+k,hs ] + state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1 ] if data_spec_int == DATA_SPEC_INJECTION: if myrank == 0: From 8c41a31de0bcb78bd494908f588dc66c69e6cbf6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 17 Mar 2025 11:30:02 -0600 Subject: [PATCH 34/83] Build script: Change default model --- c/build/cmake_linux_gnu.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c/build/cmake_linux_gnu.sh b/c/build/cmake_linux_gnu.sh index 719420d..861e651 100644 --- a/c/build/cmake_linux_gnu.sh +++ b/c/build/cmake_linux_gnu.sh @@ -9,7 +9,7 @@ PNETCDF_LDFLAGS="-lpnetcdf" #PNETCDF_CXXFLAGS="-I$/usr/include" PNETCDF_CXXFLAGS="" -DATA_SPEC="DATA_SPEC_INJECTION" +DATA_SPEC="DATA_SPEC_COLLISION" cmake \ -DCMAKE_CXX_COMPILER=mpic++ \ @@ -19,7 +19,7 @@ cmake \ -DLDFLAGS="${PNETCDF_LDFLAGS}" \ -DNX=200 \ -DNZ=100 \ - -DSIM_TIME=1200 \ + -DSIM_TIME=1000 \ -DOUT_FREQ=10 \ -DDATA_SPEC="${DATA_SPEC}" \ ${SRC_ROOT} From 74a519ea08d2e17fc0a9898d1577dd1ab02a8b07 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 17 Mar 2025 11:30:29 -0600 Subject: [PATCH 35/83] Python: Add option only to output theta (temp density) --- python/miniWeather.py | 48 ++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index a59d0da..ac39683 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -17,6 +17,9 @@ real = np.float64 # or np.float32 double = np.float64 +# Only output the temperature density ("theta") +miniweather_output_only_theta: bool = True + # # Parameters for indexing and flags # (effectively enums, but we leave them as constants @@ -72,7 +75,7 @@ # /////////////////////////////////////////////////////////////////////////////////////// # The x-direction length is twice as long as the z-direction length # So, you'll want to have nx_glob be twice as large as nz_glob -nz_glob: int = 50 # Number of total cells in the z-direction +nz_glob: int = 100 # Number of total cells in the z-direction nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction sim_time: real = 1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) @@ -512,8 +515,15 @@ def set_halo_values_x( # spurious reflections on the right and four corners. # This should not be considered physical and does not # conserve energy (but injection doesn't anyway). 
+ + state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2] + state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1] state[ll,hs+k,nx+hs ] = 0.0 state[ll,hs+k,nx+hs+1] = 0.0 + + #state[ll,hs+k,nx+hs ] = state[ll,hs+k,hs ] + #state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1 ] + else: state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2] state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1] @@ -880,18 +890,20 @@ def output( # Allocate temp arrays # # TODO (mfh 2025/03/10) Check output order - dens = np.zeros((nz,nx), dtype=real) - uwnd = np.zeros((nz,nx), dtype=real) - wwnd = np.zeros((nz,nx), dtype=real) + if not miniweather_output_only_theta: + dens = np.zeros((nz,nx), dtype=real) + uwnd = np.zeros((nz,nx), dtype=real) + wwnd = np.zeros((nz,nx), dtype=real) theta = np.zeros((nz,nx), dtype=real) etimearr = np.zeros((1), dtype=real) # Store perturbed values in temp arrays for k in range(nz): for i in range(nx): - dens[k,i] = state[ID_DENS, k+hs, i+hs] - uwnd[k,i] = state[ID_UMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) - wwnd[k,i] = state[ID_WMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) + if not miniweather_output_only_theta: + dens[k,i] = state[ID_DENS, k+hs, i+hs] + uwnd[k,i] = state[ID_UMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) + wwnd[k,i] = state[ID_WMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) theta[k,i] = (state[ID_RHOT, k+hs, i+hs] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs] with (Dataset("output.nc", "w") if etime == 0 else Dataset("output.nc", "a")) as nc: @@ -903,23 +915,27 @@ def output( nc.createDimension("z", nz_glob) # Create variables t_var = nc.createVariable("t_var", real, ("t",)) - dens_var = nc.createVariable("dens", real, ("t","z","x")) - uwnd_var = nc.createVariable("uwnd", real, ("t","z","x")) - wwnd_var = nc.createVariable("wwnd", real, ("t","z","x")) + if not miniweather_output_only_theta: + dens_var = nc.createVariable("dens", real, ("t","z","x")) + uwnd_var = nc.createVariable("uwnd", real, ("t","z","x")) + wwnd_var = nc.createVariable("wwnd", real, ("t","z","x")) theta_var = nc.createVariable("theta", real, ("t","z","x")) else: t_var = nc.variables["t_var"] - dens_var = nc.variables["dens"] - uwnd_var = nc.variables["uwnd"] - wwnd_var = nc.variables["wwnd"] + if not miniweather_output_only_theta: + dens_var = nc.variables["dens"] + uwnd_var = nc.variables["uwnd"] + wwnd_var = nc.variables["wwnd"] theta_var = nc.variables["theta"] # Write data if mainproc: t_var[num_out] = etime - dens_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = dens - uwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = uwnd - wwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = wwnd + + if not miniweather_output_only_theta: + dens_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = dens + uwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = uwnd + wwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = wwnd theta_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = theta return num_out + 1 From 94839255a173915aa3821cad5873694966bbc1fc Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 17 Mar 2025 11:40:38 -0600 Subject: [PATCH 36/83] Python: Remove injection boundary special case --- python/miniWeather.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index ac39683..003ab73 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -510,25 +510,10 @@ def set_halo_values_x( # 
////////////////////////////////////////////////////// for ll in range(NUM_VARS): for k in range(nz): - if data_spec_int == DATA_SPEC_INJECTION: - # Dirichlet boundary on the right ONLY avoids - # spurious reflections on the right and four corners. - # This should not be considered physical and does not - # conserve energy (but injection doesn't anyway). - - state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2] - state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1] - state[ll,hs+k,nx+hs ] = 0.0 - state[ll,hs+k,nx+hs+1] = 0.0 - - #state[ll,hs+k,nx+hs ] = state[ll,hs+k,hs ] - #state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1 ] - - else: - state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2] - state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1] - state[ll,hs+k,nx+hs ] = state[ll,hs+k,hs ] - state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1 ] + state[ll,hs+k,0 ] = state[ll,hs+k,nx+hs-2] + state[ll,hs+k,1 ] = state[ll,hs+k,nx+hs-1] + state[ll,hs+k,nx+hs ] = state[ll,hs+k,hs ] + state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1 ] if data_spec_int == DATA_SPEC_INJECTION: if myrank == 0: From a01bc207dd40728e05b95feca863182c1b091907 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 17 Mar 2025 14:25:12 -0600 Subject: [PATCH 37/83] Python: Restore default model --- python/miniWeather.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index 003ab73..bcb2e13 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -79,8 +79,7 @@ nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction sim_time: real = 1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) -data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data -#data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data +data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data # /////////////////////////////////////////////////////////////////////////////////////// # // END USER-CONFIGURABLE PARAMETERS # /////////////////////////////////////////////////////////////////////////////////////// From 275067fac70d8771fb182eb0b51b40c142f22846 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Thu, 20 Mar 2025 23:59:17 +0200 Subject: [PATCH 38/83] Add cpp-mdspan directory Build script is tested; build works. miniWeather_mdspan.cpp initially is just a copy of ../c/miniWeather_serial.cpp. 
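The direction this port takes, sketched under assumption (names taken from the C file; the actual conversion happens in later patches): replace hand-computed flat indices with an mdspan view over the same allocation.

    #include "mdspan/mdspan.hpp"

    namespace md = MDSPAN_IMPL_STANDARD_NAMESPACE;

    // View the existing flat allocation as (NUM_VARS, nz+2*hs, nx+2*hs).
    md::mdspan<double, md::dextents<int, 3>> s{state, NUM_VARS, nz + 2*hs, nx + 2*hs};

    // Replaces state[ID_DENS*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs].
    // operator() pre-C++23; s[ID_DENS, k+hs, i+hs] with C++23's operator[].
    double dens = s(ID_DENS, k + hs, i + hs);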
--- cpp-mdspan/CMakeLists.txt | 80 +++ cpp-mdspan/build/cmake-kermit.sh | 10 + cpp-mdspan/miniWeather_mdspan.cpp | 919 ++++++++++++++++++++++++++++++ 3 files changed, 1009 insertions(+) create mode 100644 cpp-mdspan/CMakeLists.txt create mode 100755 cpp-mdspan/build/cmake-kermit.sh create mode 100644 cpp-mdspan/miniWeather_mdspan.cpp diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt new file mode 100644 index 0000000..2a52ee6 --- /dev/null +++ b/cpp-mdspan/CMakeLists.txt @@ -0,0 +1,80 @@ +cmake_minimum_required(VERSION 3.12) +project(miniWeather-mdspan + VERSION 0.0.1 + LANGUAGES CXX +) + +# Option to override which C++ standard to use +set(MINIWEATHER_CXX_STANDARD DETECT CACHE STRING "Override the default CXX_STANDARD to compile with.") + +# Decide on the standard to use +if(MINIWEATHER_CXX_STANDARD STREQUAL "17") + if("cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message(STATUS "Using C++17 standard") + set(CMAKE_CXX_STANDARD 17) + else() + message(FATAL_ERROR "Requested MINIWEATHER_CXX_STANDARD \"17\" not supported by provided C++ compiler") + endif() +elseif(MINIWEATHER_CXX_STANDARD STREQUAL "20") + if("cxx_std_20" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message(STATUS "Using C++20 standard") + set(CMAKE_CXX_STANDARD 20) + else() + message(FATAL_ERROR "Requested MINIWEATHER_CXX_STANDARD \"20\" not supported by provided C++ compiler") + endif() +elseif(MINIWEATHER_CXX_STANDARD STREQUAL "23") + if("cxx_std_23" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message(STATUS "Using C++23 standard") + set(CMAKE_CXX_STANDARD 23) + else() + message(FATAL_ERROR "Requested MINIWEATHER_CXX_STANDARD \"23\" not supported by provided C++ compiler") + endif() +else() + if("cxx_std_23" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + set(CMAKE_CXX_STANDARD 23) + message(STATUS "Detected support for C++23 standard") + elseif("cxx_std_20" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + set(CMAKE_CXX_STANDARD 20) + message(STATUS "Detected support for C++20 standard") + elseif("cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES) + set(CMAKE_CXX_STANDARD 17) + message(STATUS "Detected support for C++17 standard") + else() + message(FATAL_ERROR "Cannot detect CXX_STANDARD of C++17 or newer.") + endif() +endif() + +find_package(mdspan QUIET) +if (NOT mdspan_FOUND) + message(STATUS "No installed mdspan found, fetching from Github") + include(FetchContent) + FetchContent_Declare( + mdspan + GIT_REPOSITORY https://github.com/kokkos/mdspan.git + GIT_TAG stable + ) + FetchContent_GetProperties(mdspan) + if(NOT mdspan_POPULATED) + FetchContent_Populate(mdspan) + add_subdirectory(${mdspan_SOURCE_DIR} ${mdspan_BINARY_DIR} EXCLUDE_FROM_ALL) + endif() +endif() + +############################################################ +## Compile the serial version +############################################################ +add_executable(serial miniWeather_mdspan.cpp) +target_link_libraries(serial INTERFACE std::mdspan) +target_include_directories(serial INTERFACE +# $ + $ +) +target_compile_options(serial PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> +) + + + diff --git a/cpp-mdspan/build/cmake-kermit.sh b/cpp-mdspan/build/cmake-kermit.sh new file mode 100755 index 0000000..88b532c --- /dev/null +++ b/cpp-mdspan/build/cmake-kermit.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +PNETCDF_ROOT=/raid/mhoemmen/pkg/pnetcdf-1.14.0 +SRC_ROOT=/raid/mhoemmen/src/miniWeather/cpp-mdspan + +LDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" CXXFLAGS="-I${PNETCDF_ROOT}/include" cmake \ + -DCMAKE_CXX_COMPILER=mpic++ \ + -DCMAKE_C_COMPILER=mpicc \ + 
-DCMAKE_Fortran_COMPILER=mpif90 \ + ${SRC_ROOT} diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp new file mode 100644 index 0000000..b25cb8a --- /dev/null +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -0,0 +1,919 @@ + +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// + +#include +#include +#include +#include +#include +#include +#include "pnetcdf.h" +#include + +#define MINIWEATHER_ONLY_OUTPUT_THETA 1 + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") +constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction +constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE 
PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob + +int constexpr nz_glob = 50; //Number of total cells in the z-direction +int constexpr nx_glob = 2 * nz_glob; //Number of total cells in the x-direction +double constexpr sim_time = 1000.0; //How many seconds to run the simulation +double constexpr output_freq = 10.0; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = DATA_SPEC_THERMAL; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction +/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are initialized but remain static over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double dt; //Model time step (seconds) +int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task +int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task +int nranks, myrank; //Number of MPI ranks and my rank id +int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain +int mainproc; //Am I the main process (rank == 0)? +double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + +/////////////////////////////////////////////////////////////////////////////////////// +// Variables that are dynamics over the course of the simulation +/////////////////////////////////////////////////////////////////////////////////////// +double etime; //Elapsed model time +double output_counter; //Helps determine when it's time to do output +//Runtime variable arrays +double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +int num_out = 0; //The number of outputs performed so far +int direction_switch = 1; +double mass0, te0; //Initial domain totals for mass and total energy +double mass , te ; //Domain totals for mass and total energy + +//How is this not in the standard?! 
+double dmin( double a , double b ) { if (a < b) { return a; } else { return b; } }
+
+
+int main(int argc, char **argv) {
+  init( &argc , &argv );
+
+  //Initial reductions for mass, kinetic energy, and total energy
+  reductions(mass0,te0);
+  {
+    fprintf(stderr, "mass0: %le\n" , mass0);
+    fprintf(stderr, "te0:   %le\n" , te0 );
+  }
+
+  //Output the initial model state
+  output(state,etime);
+
+  ////////////////////////////////////////////////////
+  // MAIN TIME STEP LOOP
+  ////////////////////////////////////////////////////
+  auto t1 = std::chrono::steady_clock::now();
+  while (etime < sim_time) {
+    //If the time step leads to exceeding the simulation time, shorten it for the last step
+    if (etime + dt > sim_time) { dt = sim_time - etime; }
+    //Perform a single time step
+    perform_timestep(state,state_tmp,flux,tend,dt);
+    //Inform the user
+#ifndef NO_INFORM
+    if (mainproc) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime , sim_time ); }
+#endif
+    //Update the elapsed time and output counter
+    etime = etime + dt;
+    output_counter = output_counter + dt;
+    //If it's time for output, reset the counter, and do output
+    if (output_counter >= output_freq) {
+      output_counter = output_counter - output_freq;
+      output(state,etime);
+    }
+#if 0
+    {
+      double mass = 0.0;
+      double te = 0.0;
+      reductions(mass, te);
+      fprintf(stderr, "mass: %le\n" , mass );
+      fprintf(stderr, "te: %le\n" , te );
+    }
+#endif // 0
+  }
+  auto t2 = std::chrono::steady_clock::now();
+  if (mainproc) {
+    std::cerr << "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n";
+  }
+
+  //Final reductions for mass, kinetic energy, and total energy
+  reductions(mass,te);
+
+  if (mainproc) {
+    fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 );
+    fprintf(stderr, "d_te:   %le\n" , (te   - te0  )/te0   );
+  }
+
+  finalize();
+}
+
+
+//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
+//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
+//order of directions is alternated each time step.
+//The Runge-Kutta method used here is defined as follows:
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {
+  //fprintf(stderr, "direction_switch: %d\n", direction_switch);
+  if (direction_switch) {
+    //x-direction first
+    semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend );
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );
+    semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend );
+    //z-direction second
+    semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend );
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );
+    semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend );
+  } else {
+    //z-direction second
+    semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend );
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend );
+    semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend );
+    //x-direction first
+    semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend );
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend );
+    semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend );
+  }
+  if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; }
+}
+
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
+void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) {
+  int i, k, ll, inds, indt, indw;
+  double x, z, wpert, dist, x0, z0, xrad, zrad, amp;
+  if (dir == DIR_X) {
+    //Set the halo values for this MPI task's fluid state in the x-direction
+
set_halo_values_x(state_forcing); + //Compute the time tendencies for the fluid state in the x-direction + compute_tendencies_x(state_forcing,flux,tend,dt); + } else if (dir == DIR_Z) { + //Set the halo values for this MPI task's fluid state in the z-direction + set_halo_values_z(state_forcing); + //Compute the time tendencies for the fluid state in the z-direction + compute_tendencies_z(state_forcing,flux,tend,dt); + } + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Apply the tendencies to the fluid state + for (ll=0; ll Date: Fri, 21 Mar 2025 00:59:20 +0200 Subject: [PATCH 39/83] Fix mdspan include path --- cpp-mdspan/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 2a52ee6..2e3642c 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -66,7 +66,7 @@ endif() add_executable(serial miniWeather_mdspan.cpp) target_link_libraries(serial INTERFACE std::mdspan) target_include_directories(serial INTERFACE -# $ + $ $ ) target_compile_options(serial PRIVATE From 0c62b3f09a1363e93ec59efdf10999f4a1e1f8b7 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 21 Mar 2025 21:36:34 +0200 Subject: [PATCH 40/83] Fix mdspan build cpp-mdspan automatically downloads mdspan and fixes the include path to point to it. --- cpp-mdspan/CMakeLists.txt | 5 +---- cpp-mdspan/build/cmake-kermit.sh | 1 + cpp-mdspan/miniWeather_mdspan.cpp | 2 ++ 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 2e3642c..0b36569 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -64,11 +64,8 @@ endif() ## Compile the serial version ############################################################ add_executable(serial miniWeather_mdspan.cpp) +include_directories(${mdspan_SOURCE_DIR}/include) target_link_libraries(serial INTERFACE std::mdspan) -target_include_directories(serial INTERFACE - $ - $ -) target_compile_options(serial PRIVATE $<$,$,$>: -Wall> diff --git a/cpp-mdspan/build/cmake-kermit.sh b/cpp-mdspan/build/cmake-kermit.sh index 88b532c..c30d9eb 100755 --- a/cpp-mdspan/build/cmake-kermit.sh +++ b/cpp-mdspan/build/cmake-kermit.sh @@ -7,4 +7,5 @@ LDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" CXXFLAGS="-I${PNETCDF_ROOT}/include" c -DCMAKE_CXX_COMPILER=mpic++ \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_Fortran_COMPILER=mpif90 \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ ${SRC_ROOT} diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index b25cb8a..29d2f88 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -16,6 +16,8 @@ #include "pnetcdf.h" #include +#include "mdspan/mdspan.hpp" + #define MINIWEATHER_ONLY_OUTPUT_THETA 1 constexpr double pi = 3.14159265358979323846264338327; //Pi From 50f71d0445b6cff2d3c233d1b53989673d537b63 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Fri, 21 Mar 2025 22:14:01 +0200 Subject: [PATCH 41/83] Add unique_mdarray implementation unique_mdarray lets code allocate a multidimensional array without needing to worry about how to copy multidimensional data efficiently. 
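A hedged usage sketch, based on the constructors and accessors exercised in test_unique_mdarray.cpp below (nz and nx stand in for run-time extents):

    #include "unique_mdarray.hpp"
    #include <memory>
    #include <span>

    using ext_t = md::extents<int, md::dynamic_extent, md::dynamic_extent>;
    md::layout_right::mapping<ext_t> m{ext_t{nz, nx}};

    // The caller allocates; unique_mdarray owns the buffer and maps indices.
    auto owned = std::make_unique<double[]>(m.required_span_size());
    std::span<double> sp{owned.release(),
                         static_cast<std::size_t>(m.required_span_size())};
    md::unique_mdarray arr{sp, m, std::default_delete<double[]>{}};

    arr(1, 2) = 42.0;                      // operator() pre-C++23, operator[] after
    md::mdspan<double, ext_t> view = arr;  // implicit, non-owning view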
--- cpp-mdspan/CMakeLists.txt | 21 +- cpp-mdspan/miniWeather_mdspan.cpp | 1 + cpp-mdspan/test_unique_mdarray.cpp | 258 ++++++++++++++ cpp-mdspan/unique_mdarray.hpp | 541 +++++++++++++++++++++++++++++ 4 files changed, 814 insertions(+), 7 deletions(-) create mode 100644 cpp-mdspan/test_unique_mdarray.cpp create mode 100644 cpp-mdspan/unique_mdarray.hpp diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 0b36569..b4698cd 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -4,10 +4,10 @@ project(miniWeather-mdspan LANGUAGES CXX ) -# Option to override which C++ standard to use -set(MINIWEATHER_CXX_STANDARD DETECT CACHE STRING "Override the default CXX_STANDARD to compile with.") +# Option to override which version of the C++ Standard to use +set(MINIWEATHER_CXX_STANDARD DETECT CACHE STRING "Override the default CXX_STANDARD") -# Decide on the standard to use +# Decide which version of the C++ Standard version to use if(MINIWEATHER_CXX_STANDARD STREQUAL "17") if("cxx_std_17" IN_LIST CMAKE_CXX_COMPILE_FEATURES) message(STATUS "Using C++17 standard") @@ -60,11 +60,10 @@ if (NOT mdspan_FOUND) endif() endif() -############################################################ -## Compile the serial version -############################################################ +# Add mdspan include directory +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${mdspan_SOURCE_DIR}/include") + add_executable(serial miniWeather_mdspan.cpp) -include_directories(${mdspan_SOURCE_DIR}/include) target_link_libraries(serial INTERFACE std::mdspan) target_compile_options(serial PRIVATE $<$,$,$>: @@ -73,5 +72,13 @@ target_compile_options(serial PRIVATE /W4> ) +add_executable(test_unique_mdarray test_unique_mdarray.cpp) +target_link_libraries(test_unique_mdarray INTERFACE std::mdspan) +target_compile_options(test_unique_mdarray PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> +) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 29d2f88..ebea9c1 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -17,6 +17,7 @@ #include #include "mdspan/mdspan.hpp" +#include "unique_mdarray.hpp" #define MINIWEATHER_ONLY_OUTPUT_THETA 1 diff --git a/cpp-mdspan/test_unique_mdarray.cpp b/cpp-mdspan/test_unique_mdarray.cpp new file mode 100644 index 0000000..0acefd7 --- /dev/null +++ b/cpp-mdspan/test_unique_mdarray.cpp @@ -0,0 +1,258 @@ +#include "unique_mdarray.hpp" +#include + +namespace test { + +using namespace md; + +template +std::remove_const_t get_value_at( + const unique_mdarray& x, + Indices... indices) +{ +#if defined(MDSPAN_USE_BRACKET_OPERATOR) && (MDSPAN_USE_BRACKET_OPERATOR != 0) + return x[indices...]; +#else + return x(indices...); +#endif +} + +template +void set_value_at( + const unique_mdarray& x, + std::remove_const_t value, + Indices... indices) +{ +#if defined(MDSPAN_USE_BRACKET_OPERATOR) && (MDSPAN_USE_BRACKET_OPERATOR != 0) + x[indices...] = value; +#else + x(indices...) = value; +#endif +} + +template +std::remove_const_t get_value_at_with_array( + const unique_mdarray& x, + Indices... indices) +{ + using index_type = typename Extents::index_type; + std::array inds{indices...}; + return x[inds]; +} + +template +void set_value_at_with_array( + const unique_mdarray& x, + std::remove_const_t value, + Indices... 
indices) +{ + using index_type = typename Extents::index_type; + std::array inds{indices...}; + x[inds] = value; +} + +template +std::remove_const_t get_value_at_with_span( + const unique_mdarray& x, + Indices... indices) +{ + using index_type = typename Extents::index_type; + std::array inds{ + static_cast(indices)... + }; + std::span sp{inds.data(), inds.size()}; + return x[sp]; +} + +template +void set_value_at_with_span( + const unique_mdarray& x, + std::remove_const_t value, + Indices... indices) +{ + using index_type = typename Extents::index_type; + std::array inds{ + static_cast(indices)... + }; + std::span sp{inds.data(), inds.size()}; + x[sp] = value; +} + +template +void test_implicit_conversion( + mdspan>) +{} + +template +void test_float_construction(const Mapping& m, const Deleter& d, std::integral_constant = {}) { + std::unique_ptr ptr = std::make_unique(m.required_span_size()); + using extents_type = typename Mapping::extents_type; + using layout_type = typename Mapping::layout_type; + using index_type = typename Mapping::index_type; + + float* raw_ptr_x = ptr.get(); + std::span sp_x{ptr.release(), static_cast(m.required_span_size())}; + unique_mdarray x{sp_x, m, d}; + + static_assert(x.rank() == extents_type::rank()); + static_assert(x.rank_dynamic() == extents_type::rank_dynamic()); + assert(x.get() == raw_ptr_x); + assert(x.extents() == m.extents()); + assert(x.mapping() == m); + + set_value_at(x, 42.0f, 1, 2); + assert(get_value_at(x, 1, 2) == 42.0f); + + set_value_at_with_array(x, 43.0f, 1, 2); + assert(get_value_at_with_array(x, 1, 2) == 43.0f); + + set_value_at_with_span(x, 44.0f, 1, 2); + assert(get_value_at_with_span(x, 1, 2) == 44.0f); + + set_value_at(x, 45.0f, 1, 2); + assert(get_value_at(x, 1, 2) == 45.0f); + + if constexpr (extents_type::rank_dynamic() != 0) { + try { + float* raw_ptr = x.release(); + delete [] raw_ptr; + } + catch (...) { + std::cerr << "x.release() threw an exception\n"; + assert(false); + } + assert(x.get() == nullptr); + + // This only works if Deleter uses the same allocation strategy. + auto ptr_x2 = std::make_unique(m.required_span_size()); + std::span sp_x2{ + ptr_x2.release(), + static_cast(m.required_span_size()) + }; + x = unique_mdarray{sp_x2, m, d}; + } + + for (index_type r = 0; r < x.extent(0); ++r) { + for (index_type c = 0; c < x.extent(1); ++c) { + set_value_at(x, 1.0f + static_cast(r + c * x.extent(1)), r, c); + } + } + + mdspan> x_view = x; + for (index_type r = 0; r < x.extent(0); ++r) { + for (index_type c = 0; c < x.extent(1); ++c) { +#if defined(MDSPAN_USE_BRACKET_OPERATOR) && (MDSPAN_USE_BRACKET_OPERATOR != 0) + const float x_rc = x_view[r, c]; +#else + const float x_rc = x_view(r, c); +#endif + const float val = 1.0f + static_cast(r + c * x.extent(1)); + assert(x_rc == val); + } + } + + if constexpr (std::is_swappable_v) { + // This only works if Deleter uses the same allocation strategy. 
+ std::unique_ptr ptr_y = std::make_unique(m.required_span_size()); + std::span sp_y{ptr_y.release(), static_cast(m.required_span_size())}; + unique_mdarray y{sp_y, m, d}; + + for (index_type r = 0; r < y.extent(0); ++r) { + for (index_type c = 0; c < y.extent(1); ++c) { + set_value_at(y, -(1.0f + static_cast(r + c * x.extent(1))), r, c); + } + } + + using std::swap; + swap(x, y); + for (index_type r = 0; r < x.extent(0); ++r) { + for (index_type c = 0; c < x.extent(1); ++c) { + const float x_rc = get_value_at(x, r, c); + const float y_rc = get_value_at(y, r, c); + const float val = 1.0f + static_cast(r + c * x.extent(1)); + assert(x_rc == -val); + assert(y_rc == val); + } + } + } +} + +template +struct my_array_deleter { + void operator() (T* ptr) const { + delete [] ptr; + } + + // Make it not swappable, just to test constraints on swap. + friend constexpr void + swap(my_array_deleter&, my_array_deleter&) = delete; +}; + +template +void construction(const Deleter& d) { + using layout_type = layout_right; + + { + using extents_type = extents; + extents_type e{2, 3}; + using mapping_type = layout_type::mapping; + test_float_construction(mapping_type{e}, d); + test_float_construction(mapping_type{e}, d, std::integral_constant{}); + } + + { + using extents_type = extents; + extents_type e{2, 3}; + using mapping_type = layout_type::mapping; + test_float_construction(mapping_type{e}, d); + test_float_construction(mapping_type{e}, d, std::integral_constant{}); + } + + { + using extents_type = extents; + extents_type e{2, 3}; + using mapping_type = layout_type::mapping; + test_float_construction(mapping_type{e}, d); + test_float_construction(mapping_type{e}, d, std::integral_constant{}); + } + + { + using extents_type = extents; + extents_type e{2, 3}; + using mapping_type = layout_type::mapping; + test_float_construction(mapping_type{e}, d); + test_float_construction(mapping_type{e}, d, std::integral_constant{}); + } +} + +} // namespace test + +int main() { + test::construction(std::default_delete{}); + test::construction(test::my_array_deleter{}); + return 0; +} diff --git a/cpp-mdspan/unique_mdarray.hpp b/cpp-mdspan/unique_mdarray.hpp new file mode 100644 index 0000000..d53cc19 --- /dev/null +++ b/cpp-mdspan/unique_mdarray.hpp @@ -0,0 +1,541 @@ +#pragma once + +#include "mdspan/mdspan.hpp" +#include +#include +#include +#include + +// operator[] would need C++23 for multiple parameters. +// The reference mdspan has its own macros (please see above). +#if defined(MDSPAN_USE_BRACKET_OPERATOR) && (MDSPAN_USE_BRACKET_OPERATOR != 0) +# define MDSPAN_ARRAY_ACCESS_OPERATOR operator[] +#else +# define MDSPAN_ARRAY_ACCESS_OPERATOR operator() +#endif + +namespace md { + +namespace stdex = MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE ; + +using std::dynamic_extent; +using std::size_t; +using MDSPAN_IMPL_STANDARD_NAMESPACE :: extents; +using MDSPAN_IMPL_STANDARD_NAMESPACE :: layout_right; +using MDSPAN_IMPL_STANDARD_NAMESPACE :: mdspan; +using MDSPAN_IMPL_STANDARD_NAMESPACE :: default_accessor; + +namespace impl { + +template +constexpr size_t product_of_static_extents(extents e) { + return ((e.static_extent(Extents) == dynamic_extent ? size_t(0) : e.static_extent(Extents)) * ... * size_t(1)); +} + +template +constexpr typename extents::size_type + forward_product_of_extents(extents) +{ + return 0; +} + +template +constexpr typename extents::size_type + forward_product_of_extents(extents e) +{ + return (e.extent(Exts) * ... 
* size_t(1)); +} + +template +constexpr bool empty(extents e) +{ + return ((e.extent(Exts) == 0) || ... || true); +} + +} // namespace impl + +// Differences from P1684: +// +// 1. Separate dynamic allocation case from static allocation case. +// +// This sidesteps all the issues with mdarray having +// moved-from behavior that depends on the container type. +// +// 2. It's not a container adapter. +// +// Users don't really want to think about container adapter behavior. +// std::vector has to store capacity(), which mdarray doesn't use. +// +// 3. It's not even a container. +// +// Containers are "synchronous by nature." +// Their constructors have a postconditition +// that element access is valid, and thus, +// that the fill or copy that created them is done. +// +// Containers need to know how to fill and copy elements. +// That means needing to know about CUDA streams and synchronization. +// Users already need to handle that in their parallel algorithms. +// Some container constructors (e.g., mdarray construction from mdspan) +// have no convenient way to convey a CUDA stream. +// +// Containers need allocators for two reasons: +// because their constructors need to allocate, +// and because they permit resizing. +// We don't want to permit resizing +// (as extents can be any combination of static or dynamic), +// and it's a lot easier to let users allocate +// than to make the constructor do it. + +// Key features: +// +// 1. Behaves like a pointer (unique_ptr, specifically), +// not like a container. +// It doesn't need an allocator or a CUDA stream. +// +// 2. Callers are responsible for element fills and (deep) copies. +// Callers don't have to pay for initialization of elements +// (if value_type is implicit-lifetime). +// +// 3. Default-constructing it or moving from it results +// in a valid (empty) multidimensional array. +// You can't do either if the extents are all static. +// +// 4. You can reuse its storage safely by calling release(). +// This has the same constraints as default construction +// or the move constructor, and results in a valid (empty) +// multidimensional array. +// +// Execution policies should be separate from allocations, +// just like parallel algorithms are separate from allocations. +// If the deleter does cudaFreeAsync, it should store its own stream. +// Otherwise, users would manage the stream as part of +// their asynchronous call graph, however they choose to do that. +// For example, if using std::execution, +// a sender adapter for asynchronous allocations would return +// a sender of unique_mdarray +// with the CUDA stream stored in the scheduler. + +// Design questions: +// +// 1. Always act as-if default_accessor, +// or permit custom accessors (e.g., aligned_accessor)? +// +// 2. Permit reset(p) (with nonnull pointer p)? +// +// 3. Take span instead of ElementType* ? +// +// Regarding (1), the simpler design would make the viewing mdspan +// always have default_accessor. +// This won't give host vs. device access protection. It would also +// lose information that the creator has, like overalignment. +// A more general design would let the user provide an accessor. +// This means that data_handle_type might not be ElementType*, etc., +// so that we couldn't just use unique_ptr to implement it. +// +// Regarding (2), reset(p) implies that +// [p, p + mapping_.required_span_size()) is a valid range. +// While users could make a mistake with p, +// they might have made the same mistake with the constructor. 
+//
+// Regarding (3), using span makes constructor preconditions
+// more explicit and checkable.  It does mean that a common pattern
+// for allocating memory and transferring it to the unique_mdarray
+// becomes less safe, because there are more steps between releasing
+// the unique_ptr's control of the allocation (see code below)
+// and creating the unique_mdarray.
+//
+// auto ptr = std::make_unique<ElementType[]>(m.required_span_size());
+// std::span sp{ptr.release(), m.required_span_size()};
+// unique_mdarray md{sp, m};
+//
+// Contrast that with using a raw pointer.
+//
+// auto ptr = std::make_unique<ElementType[]>(m.required_span_size());
+// unique_mdarray md{ptr.release(), m};
+//
+// Note that unique_mdarray's constructor could throw if, for example,
+// the layout mapping's copy constructor throws.
+//
+// ===================
+// Template parameters
+// ===================
+//
+// ElementType: The (possibly const-qualified) type of each element.
+//   Const qualification is permitted, just as with unique_ptr.
+// Extents: Specialization of std::extents.
+// Layout: As in mdspan.
+// Deleter: As in unique_ptr; must be an array deleter.
+template<
+  class ElementType,
+  class Extents,
+  class Layout = layout_right,
+  class Deleter = std::default_delete<ElementType[]>>
+class unique_mdarray {
+public:
+  using extents_type = Extents;
+  using layout_type = Layout;
+  using mapping_type = typename layout_type::template mapping<extents_type>;
+  using element_type = ElementType;
+  using value_type = std::remove_const_t<ElementType>;
+  using index_type = typename extents_type::index_type;
+  using size_type = typename extents_type::size_type;
+  using rank_type = typename extents_type::rank_type;
+
+  // It's not a container; all operator[] access is const,
+  // even if access to the element is not.
+  //
+  //using const_reference = std::add_const_t<ElementType>&;
+  using reference = ElementType&;
+
+  // Like unique_ptr.
+  //
+  // We define element_type above.
+  // For pointer, see [unique.ptr.runtime.general] and [unique.ptr.single.general] 3.
+  using pointer = typename std::unique_ptr<ElementType[], Deleter>::pointer;
+  using deleter_type = Deleter;
+
+  // Should this class have data_handle_type too?
+  // It only really makes sense if we allow "pointer"
+  // to be something other than ElementType*.
+  //
+  //using data_handle_type = ElementType*;
+
+  //////////////////////////////////////////////////
+  // Constructors and other special member functions
+  //////////////////////////////////////////////////
+
+  // Default construction is only permitted
+  // if a default-constructed unique_mdarray's mdspan would be valid:
+  // that is, if the type is rank zero, has at least one dynamic extent,
+  // or has a static extent equal to zero.
+  // This ensures that the resulting object is valid,
+  // specifically that it does not "lie"
+  // about the size of its multidimensional index space.
+  unique_mdarray() requires(
+    extents_type::rank() == 0 ||
+    extents_type::rank_dynamic() != 0 ||
+    impl::product_of_static_extents(extents_type{}) == 0
+  ) = default;
+
+  unique_mdarray(const unique_mdarray&) = delete;
+  unique_mdarray& operator=(const unique_mdarray&) = delete;
+  ~unique_mdarray() = default;
+
+  // Move construction or move assignment is only permitted
+  // in the cases where default construction would be valid.
+  // This ensures that the moved-from object is valid,
+  // specifically that it does not "lie"
+  // about the size of its multidimensional index space.
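+  //
+  // For example (a sketch; the initializers are elided):
+  //
+  //   using dyn2d = extents<size_t, dynamic_extent, dynamic_extent>;
+  //   unique_mdarray<float, dyn2d> a = /* ... */;
+  //   auto b = std::move(a);  // OK: a is left viewing extents {0, 0}
+  //
+  //   unique_mdarray<float, extents<size_t, 3, 4>> c = /* ... */;
+  //   auto d = std::move(c);  // ill-formed: c could never report size zero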
+  unique_mdarray(unique_mdarray&&) = delete;
+
+  unique_mdarray(unique_mdarray&& moved_from) requires(
+    (
+      extents_type::rank() == 0 ||
+      extents_type::rank_dynamic() != 0 ||
+      impl::product_of_static_extents(extents_type{}) == 0
+    ) &&
+    std::is_constructible_v<mapping_type, extents_type>
+  )
+    : ptr_(std::move(moved_from.ptr_))
+    , mapping_(std::move(moved_from.mapping_))
+  {
+    moved_from.mapping_ = mapping_type{extents_type{}};
+  }
+
+  unique_mdarray& operator=(unique_mdarray&&) = delete;
+
+  unique_mdarray& operator=(unique_mdarray&& moved_from) requires(
+    (
+      extents_type::rank() == 0 ||
+      extents_type::rank_dynamic() != 0 ||
+      impl::product_of_static_extents(extents_type{}) == 0
+    ) &&
+    std::is_constructible_v<mapping_type, extents_type>
+  )
+  {
+    if (&moved_from != this) {
+      ptr_ = std::move(moved_from.ptr_);
+      mapping_ = std::move(moved_from.mapping_);
+      moved_from.mapping_ = mapping_type{extents_type{}};
+    }
+    return *this;
+  }
+
+  // unique_ptr constructors don't permit implicit conversions to pointer.
+
+  // Just for now, I'll leave out all the cases of Deleter
+  // being something funny, and the weird constraints on
+  // unique_ptr(type_identity_t<pointer>, d) constructors.
+  // I'll just say "Deleter d" for now.
+
+  // The "parent constructor" to which most of the other constructors defer.
+  //
+  // Specific mappings have a required_span_size that is a constant expression
+  // if all the extents are static.  In that case, if InputExtent is not dynamic_extent,
+  // then we could turn the precondition into a static_assert.
+  //
+  // If we don't want to use span for the input range,
+  // then we would use std::type_identity_t<ElementType*> data,
+  // just as unique_ptr does.
+  template<size_t InputExtent>
+  unique_mdarray(std::span<ElementType, InputExtent> sp, const mapping_type& m, Deleter d)
+    : ptr_(sp.data(), d), mapping_(m)
+  {
+    assert(static_cast<size_t>(m.required_span_size()) <= sp.size());
+  }
+
+  template<size_t InputExtent>
+  unique_mdarray(std::span<ElementType, InputExtent> sp, const extents_type& e, Deleter d)
+    requires(
+      std::is_constructible_v<mapping_type, extents_type>
+    )
+    : unique_mdarray(sp, mapping_type{e}, d)
+  {
+    assert(sp.size() >= static_cast<size_t>(mapping_.required_span_size()));
+  }
+
+  template<size_t InputExtent>
+  unique_mdarray(std::span<ElementType, InputExtent> sp, const mapping_type& m)
+    requires(
+      std::is_nothrow_default_constructible_v<Deleter>
+    )
+    : unique_mdarray(sp, m, Deleter{})
+  {}
+
+  template<size_t InputExtent>
+  unique_mdarray(std::span<ElementType, InputExtent> sp, const extents_type& e)
+    requires(
+      std::is_constructible_v<mapping_type, extents_type> &&
+      std::is_nothrow_default_constructible_v<Deleter>
+    )
+    : unique_mdarray(sp, e, Deleter{})
+  {}
+
+  // This is the analog of the mdspan constructor that takes a list of extents
+  // (as things convertible to index_type, generally integers).
+  // That constructor doesn't accept an accessor.
+  // Analogously, this constructor doesn't accept a deleter.
+  template<size_t InputExtent, class... OtherIndexTypes>
+  requires(
+    std::is_nothrow_default_constructible_v<Deleter> &&
+    (std::is_convertible_v<OtherIndexTypes, index_type> && ...) &&
+    (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && ...) &&
+    (
+      sizeof...(OtherIndexTypes) == extents_type::rank() ||
+      sizeof...(OtherIndexTypes) == extents_type::rank_dynamic()
+    ) &&
+    std::is_constructible_v<mapping_type, extents_type>
+  )
+  unique_mdarray(std::span<ElementType, InputExtent> sp, OtherIndexTypes... exts)
+    : unique_mdarray(sp, mapping_type{extents_type{std::move(exts)...}}, Deleter{})
+  {}
+
+  // unique_ptr(pointer) is explicit.
+  //
+  // Construction without extents_type or mapping_type implies
+  // construction from extents_type{}.
+  template<size_t InputExtent>
+  explicit unique_mdarray(std::span<ElementType, InputExtent> sp)
+    requires (
+      std::is_nothrow_default_constructible_v<Deleter> &&
+      std::is_constructible_v<mapping_type, extents_type>
+    )
+    : unique_mdarray(sp, mapping_type{extents_type{}}, Deleter{})
+  {}
+
+  // unique_ptr(nullptr_t) is NOT explicit.
+  // The constraints make the implicit conversion harmless,
+  // as the resulting array will have size zero
+  // (and therefore won't be accessible anyway).
+  unique_mdarray(std::nullptr_t)
+    requires (
+      std::is_nothrow_default_constructible_v<Deleter> &&
+      std::is_constructible_v<std::unique_ptr<ElementType[], Deleter>,
+        std::nullptr_t> &&
+      std::is_constructible_v<mapping_type, extents_type> &&
+      (
+        extents_type::rank_dynamic() != 0 ||
+        impl::empty(extents_type{})
+      )
+    )
+    : ptr_(nullptr), mapping_{extents_type{}}
+  {}
+
+  //////////////////////////////////////////////////
+  // Functions adopted directly from unique_ptr
+  //////////////////////////////////////////////////
+
+  constexpr deleter_type& get_deleter() { return ptr_.get_deleter(); }
+  constexpr const deleter_type& get_deleter() const { return ptr_.get_deleter(); }
+  constexpr pointer get() const { return ptr_.get(); }
+  constexpr explicit operator bool() const noexcept {
+    return bool(ptr_);
+  }
+
+  template<class D = Deleter>
+  friend constexpr void swap(
+    unique_mdarray& x,
+    unique_mdarray& y,
+    std::enable_if_t<std::is_swappable_v<D>, void>* = nullptr) noexcept
+  {
+    using std::swap;
+    swap(x.ptr_, y.ptr_);
+    swap(x.mapping_, y.mapping_);
+  }
+
+  //////////////////////////////////////////////////////////
+  // Get a nonowning mdspan that views the elements
+  //////////////////////////////////////////////////////////
+  constexpr operator
+    mdspan<ElementType, Extents, Layout>() const {
+    return {ptr_.get(), mapping_};
+  }
+
+  //////////////////////////////////////////////////////////
+  // release and reset (both constrained, unlike unique_ptr)
+  //////////////////////////////////////////////////////////
+
+  // Only permit calling release() if we can ensure
+  // the postcondition that extents() has size zero.
+  template<class T = extents_type> requires(
+    std::is_same_v<T, extents_type> &&
+    (
+      extents_type::rank_dynamic() != 0 ||
+      impl::empty(extents_type{})
+    ) &&
+    std::is_constructible_v<mapping_type, extents_type>
+  )
+  constexpr pointer release() noexcept {
+    auto p = ptr_.release();
+    mapping_ = mapping_type{extents_type{}};
+    return p;
+  }
+
+  // Only permit calling reset() if we can ensure
+  // the postcondition that extents() has size zero.
+  template<class T = extents_type> requires(
+    std::is_same_v<T, extents_type> &&
+    (
+      extents_type::rank_dynamic() != 0 ||
+      impl::empty(extents_type{})
+    ) &&
+    std::is_constructible_v<mapping_type, extents_type>
+  )
+  constexpr void reset() noexcept {
+    // Unlike release(), reset() must also free the allocation,
+    // not just relinquish ownership of it.
+    ptr_.reset();
+    mapping_ = mapping_type{extents_type{}};
+  }
+
+  // reset with a single pointer argument implies
+  // that the extents haven't changed.
+  // Thus, we don't have to recreate the mapping.
+  constexpr void reset(std::type_identity_t<pointer> p) noexcept {
+    ptr_.reset(p);
+  }
+
+  //////////////////////////////////////////////////////////
+  // mdspan-like interface
+  //////////////////////////////////////////////////////////
+
+  static constexpr rank_type rank() noexcept {
+    return extents_type::rank();
+  }
+  static constexpr rank_type rank_dynamic() noexcept {
+    return extents_type::rank_dynamic();
+  }
+  static constexpr size_t static_extent(rank_type r) noexcept
+  {
+    return extents_type::static_extent(r);
+  }
+  constexpr index_type extent(rank_type r) const noexcept {
+    return extents().extent(r);
+  }
+
+  // It's not a container; it works like unique_ptr.
+  // Thus, operator[] always returns reference.
+  // If element_type is const-qualified, then so is the reference.
+  // There's no non-const overload, as a container would have.
+
+  template<class... OtherIndexTypes>
+  requires(
+    (std::is_convertible_v<OtherIndexTypes, index_type> && ...) &&
+    (std::is_nothrow_constructible_v<index_type, OtherIndexTypes> && ...) &&
+    sizeof...(OtherIndexTypes) == extents_type::rank()
+  )
+  reference MDSPAN_ARRAY_ACCESS_OPERATOR
+    (OtherIndexTypes...
indices) const { + return ptr_[mapping_(static_cast(std::move(indices))...)]; + } + + template + constexpr reference + operator[](std::span indices) const + { + return [this, indices] (std::index_sequence) -> reference { + return ptr_[mapping_(static_cast(indices[Which])...)]; + } (std::make_index_sequence()); + } + + template + constexpr reference + operator[](const std::array& indices) const + { + return [this, &indices] (std::index_sequence) -> reference { + return ptr_[mapping_(static_cast(indices[Which])...)]; + } (std::make_index_sequence()); + } + + constexpr size_type size() const noexcept { + return forward_product_of_extents(extents()); + } + constexpr bool empty() const noexcept { + return empty(extents()); + } + + constexpr const extents_type& extents() const noexcept { + return mapping_.extents(); + } + // We don't include data_handle(), because the name suggests + // that it could be something other than ElementType*. + // + //constexpr const data_handle_type& data_handle() const noexcept { + // return ptr_; + //} + constexpr const mapping_type& mapping() const noexcept { + return mapping_; + } + + static constexpr bool is_always_unique() + { return mapping_type::is_always_unique(); } + static constexpr bool is_always_exhaustive() + { return mapping_type::is_always_exhaustive(); } + static constexpr bool is_always_strided() + { return mapping_type::is_always_strided(); } + + constexpr bool is_unique() const + { return mapping_.is_unique(); } + constexpr bool is_exhaustive() const + { return mapping_.is_exhaustive(); } + constexpr bool is_strided() const + { return mapping_.is_strided(); } + constexpr index_type stride(rank_type r) const + { return mapping_.stride(r); } + +private: + std::unique_ptr ptr_{}; + typename Layout::template mapping mapping_{}; +}; + +// +// The analog of make_unique needs a mapping instead of a size. +// +template +constexpr unique_mdarray + make_unique_mdarray(const Mapping& mapping) +{ + const auto num_elts = mapping.required_span_size(); + auto ptr = std::make_unique(num_elts); + return {std::span{ptr.release(), num_elts}, mapping}; +} + +} // namespace md From 5c1cdde7a455c7b49ecd8f67784510f752fe62ad Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 24 Mar 2025 19:13:35 +0200 Subject: [PATCH 42/83] Start mdspan port * Replace raw allocations with make_unique * Fix unused variable warnings * Replace dmin with fmin (note different NaN behavior) --- cpp-mdspan/miniWeather_mdspan.cpp | 116 ++++++++++++------------------ 1 file changed, 45 insertions(+), 71 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index ebea9c1..6564e1b 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -82,11 +82,11 @@ int i_beg, k_beg; //beginning index in the x- and z-directions for t int nranks, myrank; //Number of MPI ranks and my rank id int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain int mainproc; //Am I the main process (rank == 0)? -double *hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) -double *hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) -double *hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) -double *hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) -double *hy_pressure_int; //hydrostatic press (vert cell interf). 
Dimensions: (1:nz+1) +std::unique_ptr hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) +std::unique_ptr hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) +std::unique_ptr hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) +std::unique_ptr hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) +std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) /////////////////////////////////////////////////////////////////////////////////////// // Variables that are dynamics over the course of the simulation @@ -94,18 +94,15 @@ double *hy_pressure_int; //hydrostatic press (vert cell interf). Dimensio double etime; //Elapsed model time double output_counter; //Helps determine when it's time to do output //Runtime variable arrays -double *state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -double *state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -double *flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) -double *tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +std::unique_ptr state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +std::unique_ptr state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +std::unique_ptr flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +std::unique_ptr tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) int num_out = 0; //The number of outputs performed so far int direction_switch = 1; double mass0, te0; //Initial domain totals for mass and total energy double mass , te ; //Domain totals for mass and total energy -//How is this not in the standard?! -double dmin( double a , double b ) { if (a sim_time) { dt = sim_time - etime; } //Perform a single time step - perform_timestep(state,state_tmp,flux,tend,dt); + perform_timestep(state.get(), state_tmp.get(), flux.get(), tend.get(), dt); //Inform the user #ifndef NO_INFORM if (mainproc) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime , sim_time ); } @@ -165,7 +162,7 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (output_counter >= output_freq) { output_counter = output_counter - output_freq; - output(state,etime); + output(state.get(), etime); } #if 0 { @@ -459,14 +456,11 @@ void set_halo_values_x( double *state ) { //Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI //decomposition in the vertical direction void set_halo_values_z( double *state ) { - int i, ll; - const double mnt_width = xlen/8; - double x, xloc, mnt_deriv; ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// - for (ll=0; ll( (nx+2*hs)*(nz+2*hs)*NUM_VARS ); + state_tmp = std::make_unique( (nx+2*hs)*(nz+2*hs)*NUM_VARS ); + flux = std::make_unique( (nx+1)*(nz+1)*NUM_VARS ); + tend = std::make_unique( nx*nz*NUM_VARS ); + hy_dens_cell = std::make_unique( (nz+2*hs) ); + hy_dens_theta_cell = std::make_unique( (nz+2*hs) ); + hy_dens_int = std::make_unique( (nz+1) ); + hy_dens_theta_int = std::make_unique( (nz+1) ); + hy_pressure_int = std::make_unique( (nz+1) ); //Define the maximum stable time step based on an assumed maximum wind speed - dt = dmin(dx,dz) / max_speed * cfl; + dt = fmin(dx,dz) / max_speed * cfl; //Set initial elapsed model time and output_counter to zero etime = 0.; output_counter = 0.; @@ -548,7 +542,7 @@ void init( int *argc , char ***argv ) { fprintf(stderr, "dt: %lf\n",dt); } //Want to make sure this info is displayed before further output - ierr = MPI_Barrier(MPI_COMM_WORLD); + (void) MPI_Barrier(MPI_COMM_WORLD); ////////////////////////////////////////////////////////////////////////// // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature @@ -742,23 +736,24 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //The file I/O uses parallel-netcdf, the only external library required for this mini-app. //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics void output( double *state , double etime ) { -#if 1 - int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3]; - int i, k, ind_r, ind_u, ind_w, ind_t; + int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; + int i, k, ind_r, ind_t; +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) + int ind_u, ind_w, dens_varid, uwnd_varid, wwnd_varid; +#endif MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; - //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) - double *dens, *uwnd, *wwnd, *theta; - double *etimearr; + //Inform the user if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); } - //Allocate some (big) temp arrays + + //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - dens = (double *) malloc(nx*nz*sizeof(double)); - uwnd = (double *) malloc(nx*nz*sizeof(double)); - wwnd = (double *) malloc(nx*nz*sizeof(double)); + auto dens = std::make_unique(nx*nz); + auto uwnd = std::make_unique(nx*nz); + auto wwnd = std::make_unique(nx*nz); #endif - theta = (double *) malloc(nx*nz*sizeof(double)); - etimearr = (double *) malloc(1 *sizeof(double)); + auto theta = std::make_unique(nx*nz); + auto etimearr = std::make_unique(1); // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. // It's possible that earlier PNetCDF versions tolerated MPI_INFO_NULL. 
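// Aside on this patch's dmin -> fmin change in init() (a sketch, assuming
// IEEE 754 semantics): the old dmin(a, b) returned b whenever a < b was
// false, so a NaN in the second argument propagated, while std::fmin
// returns the non-NaN argument:
//
//   dmin(std::nan(""), 1.0);        // 1.0: a < b is false, so returns b
//   dmin(1.0, std::nan(""));        // NaN: a < b is false, so returns b
//   std::fmin(1.0, std::nan(""));   // 1.0: fmin ignores a lone NaN
//
// Here dx and dz are finite constants, so dt should be unaffected.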
@@ -828,7 +823,7 @@ void output( double *state , double etime ) { ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd ) , __LINE__ ); ncwrap( ncmpi_put_vara_double_all( ncid , wwnd_varid , st3 , ct3 , wwnd ) , __LINE__ ); #endif - ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.get() ) , __LINE__ ); //Only the main process needs to write the elapsed time //Begin "independent" write mode @@ -838,29 +833,18 @@ void output( double *state , double etime ) { st1[0] = num_out; ct1[0] = 1; etimearr[0] = etime; - ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr.get() ) , __LINE__ ); } //End "independent" write mode ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ ); //Close the file ncwrap( ncmpi_close(ncid) , __LINE__ ); -#endif // 0 + //Increment the number of outputs num_out = num_out + 1; -#if 1 - MPI_Info_free(&mpi_info); - - //Deallocate the temp arrays -#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - free( dens ); - free( uwnd ); - free( wwnd ); -#endif - free( theta ); - free( etimearr ); -#endif // 0 + (void) MPI_Info_free(&mpi_info); } @@ -875,17 +859,7 @@ void ncwrap( int ierr , int line ) { void finalize() { - int ierr; - free( state ); - free( state_tmp ); - free( flux ); - free( tend ); - free( hy_dens_cell ); - free( hy_dens_theta_cell ); - free( hy_dens_int ); - free( hy_dens_theta_int ); - free( hy_pressure_int ); - ierr = MPI_Finalize(); + (void) MPI_Finalize(); } From 4ad6128c383910b7506f8092d4f85d1e221f2fb1 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 24 Mar 2025 19:31:08 +0200 Subject: [PATCH 43/83] C++-ify reductions --- cpp-mdspan/miniWeather_mdspan.cpp | 42 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 6564e1b..d7ad918 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -100,8 +100,6 @@ std::unique_ptr flux; //Cell interface fluxes. Dimen std::unique_ptr tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) int num_out = 0; //The number of outputs performed so far int direction_switch = 1; -double mass0, te0; //Initial domain totals for mass and total energy -double mass , te ; //Domain totals for mass and total energy //Declaring the functions defined after "main" @@ -123,7 +121,12 @@ void compute_tendencies_x ( double *state , double *flux , double *tend , doub void compute_tendencies_z ( double *state , double *flux , double *tend , double dt); void set_halo_values_x ( double *state ); void set_halo_values_z ( double *state ); -void reductions ( double &mass , double &te ); + +struct reduction_result { + double mass; + double te; +}; +reduction_result reductions(); /////////////////////////////////////////////////////////////////////////////////////// @@ -133,8 +136,11 @@ int main(int argc, char **argv) { init( &argc , &argv ); - //Initial reductions for mass, kinetic energy, and total energy - reductions(mass0,te0); + //Initial reductions for mass, kinetic energy, and total energy. 
+ // + // mass0: initial domain total for mass + // te0: initial domain total for total energy + auto [mass0, te0] = reductions(); { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); @@ -166,9 +172,7 @@ int main(int argc, char **argv) { } #if 0 { - double mass = 0.0; - double te = 0.0; - reductions(mass, te); + auto [mass, te] = reductions(); fprintf(stderr, "mass: %le\n" , mass ); fprintf(stderr, "te: %le\n" , te ); } @@ -180,7 +184,7 @@ int main(int argc, char **argv) { } //Final reductions for mass, kinetic energy, and total energy - reductions(mass,te); + auto [mass, te] = reductions(); if (mainproc) { fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); @@ -864,9 +868,9 @@ void finalize() { //Compute reduced quantities for error checking without resorting to the "ncdiff" tool -void reductions( double &mass , double &te ) { - mass = 0; - te = 0; +reduction_result reductions() { + reduction_result result{0.0, 0.0}; + for (int k=0; k Date: Mon, 24 Mar 2025 19:54:03 +0200 Subject: [PATCH 44/83] output: start using mdspan for state --- cpp-mdspan/miniWeather_mdspan.cpp | 47 ++++++++++++++++++------------- cpp-mdspan/unique_mdarray.hpp | 2 -- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index d7ad918..8d69cce 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -94,13 +94,27 @@ std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell i double etime; //Elapsed model time double output_counter; //Helps determine when it's time to do output //Runtime variable arrays -std::unique_ptr state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -std::unique_ptr state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -std::unique_ptr flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) -std::unique_ptr tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +// +// C indexing seems to prefer the extents in reverse order. +// Respecting that also avoids divergence from the Python version. +// This means that the mdspan must be layout_right; the intent appears +// to be for C code to use row-major storage, but with Fortran ordering. +// +// state extents: NUM_VARS, (nz+2*hs), (nx+2*hs) +// +std::unique_ptr state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +std::unique_ptr state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +std::unique_ptr flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +std::unique_ptr tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) int num_out = 0; //The number of outputs performed so far int direction_switch = 1; +namespace md { + using MDSPAN_IMPL_STANDARD_NAMESPACE :: layout_left; + using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; +} // namespace md + +using view_3d = md::mdspan, md::layout_left>; //Declaring the functions defined after "main" void init ( int *argc , char ***argv ); @@ -739,11 +753,12 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) //The file I/O uses parallel-netcdf, the only external library required for this mini-app. 
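// Aside on the layout comment earlier in this diff (a sketch): for extents
// {NUM_VARS, nz+2*hs, nx+2*hs}, layout_right makes the last index (i)
// stride-1, matching the flat index arithmetic this patch replaces:
//
//   layout_right: offset(ll, k, i) = (ll*(nz+2*hs) + k)*(nx+2*hs) + i
//   layout_left:  offset(ll, k, i) = ll + NUM_VARS*(k + (nz+2*hs)*i)
//
// The view_3d alias added above still says layout_left; a later patch in
// this series ("Fix view_3d layout") switches it to layout_right.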
//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -void output( double *state , double etime ) { +void output(double* state_ptr, double etime) { + auto state = view_3d(state_ptr, NUM_VARS, (nz+2*hs), (nx+2*hs)); + int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; - int i, k, ind_r, ind_t; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - int ind_u, ind_w, dens_varid, uwnd_varid, wwnd_varid; + int dens_varid, uwnd_varid, wwnd_varid; #endif MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; @@ -802,20 +817,14 @@ void output( double *state , double etime ) { } //Store perturbed values in the temp arrays for output - for (k=0; k Date: Mon, 24 Mar 2025 20:44:05 +0200 Subject: [PATCH 45/83] Fix view_3d layout; mdspan-ify more --- cpp-mdspan/miniWeather_mdspan.cpp | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 8d69cce..c6c7828 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -114,7 +114,7 @@ namespace md { using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; } // namespace md -using view_3d = md::mdspan, md::layout_left>; +using view_3d = md::mdspan, md::layout_right>; //Declaring the functions defined after "main" void init ( int *argc , char ***argv ); @@ -765,13 +765,18 @@ void output(double* state_ptr, double etime) { //Inform the user if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); } - //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta) + //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta). + // + // As with state, we retain the reversed order of extents. + // Some compilers aren't so good at CTAD for mapping. + auto mapping_2d = md::layout_right::template mapping>{md::dims<2>{nz, nx}}; + #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - auto dens = std::make_unique(nx*nz); - auto uwnd = std::make_unique(nx*nz); - auto wwnd = std::make_unique(nx*nz); + auto dens = md::make_unique_mdarray(mapping_2d); + auto uwnd = md::make_unique_mdarray(mapping_2d); + auto wwnd = md::make_unique_mdarray(mapping_2d); #endif - auto theta = std::make_unique(nx*nz); + auto theta = md::make_unique_mdarray(mapping_2d); auto etimearr = std::make_unique(1); // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. @@ -820,11 +825,11 @@ void output(double* state_ptr, double etime) { for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { #if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) - dens [k*nx+i] = state(ID_DENS, k+hs, i+hs); - uwnd [k*nx+i] = state(ID_UMOM, k+hs, i+hs) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ); - wwnd [k*nx+i] = state(ID_WMOM, k+hs, i+hs) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ); + dens(k, i) = state(ID_DENS, k+hs, i+hs); + uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ); + wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ); #endif - theta[k*nx+i] = ( state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[k+hs] ) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]; + theta(k, i) = ( state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[k+hs] ) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]; } } @@ -832,9 +837,9 @@ void output(double* state_ptr, double etime) { st3[0] = num_out; st3[1] = k_beg; st3[2] = i_beg; ct3[0] = 1 ; ct3[1] = nz ; ct3[2] = nx ; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , wwnd_varid , st3 , ct3 , wwnd ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens.get() ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd.get() ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double_all( ncid , wwnd_varid , st3 , ct3 , wwnd.get() ) , __LINE__ ); #endif ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.get() ) , __LINE__ ); From 5bd6d57f5ca910ba4e0a5e4b4f772ef07e24942e Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 24 Mar 2025 20:54:27 +0200 Subject: [PATCH 46/83] mdspan-ify set_halo_values_x --- cpp-mdspan/miniWeather_mdspan.cpp | 43 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index c6c7828..09954f1 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -245,27 +245,27 @@ void perform_timestep( double *state , double *state_tmp , double *flux , double //state_out = state_init + dt * rhs(state_forcing) //Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) { - int i, k, ll, inds, indt, indw; + int inds, indt, indw; double x, z, wpert, dist, x0, z0, xrad, zrad, amp; if (dir == DIR_X) { //Set the halo values for this MPI task's fluid state in the x-direction set_halo_values_x(state_forcing); //Compute the time tendencies for the fluid state in the x-direction - compute_tendencies_x(state_forcing,flux,tend,dt); + compute_tendencies_x(state_forcing, flux, tend, dt); } else if (dir == DIR_Z) { //Set the halo values for this MPI task's fluid state in the z-direction set_halo_values_z(state_forcing); //Compute the time tendencies for the fluid state in the z-direction - compute_tendencies_z(state_forcing,flux,tend,dt); + compute_tendencies_z(state_forcing, flux, tend, dt); } ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// //Apply the tendencies to the fluid state - 
for (ll=0; ll Date: Mon, 24 Mar 2025 20:56:58 +0200 Subject: [PATCH 47/83] mdspan-ify set_halo_values_z --- cpp-mdspan/miniWeather_mdspan.cpp | 32 ++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 09954f1..9d4a86c 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -470,27 +470,29 @@ void set_halo_values_x( double* state_ptr ) { //Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI //decomposition in the vertical direction -void set_halo_values_z( double *state ) { +void set_halo_values_z(double* state_ptr) { + auto state = view_3d(state_ptr, NUM_VARS, (nz+2*hs), (nx+2*hs)); + ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// - for (int ll=0; ll Date: Mon, 24 Mar 2025 21:03:53 +0200 Subject: [PATCH 48/83] Start mdspan-ifying compute_tendencies_x --- cpp-mdspan/miniWeather_mdspan.cpp | 40 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 9d4a86c..4d3d8e9 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -303,22 +303,22 @@ void semi_discrete_step( double *state_init , double *state_forcing , double *st //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x( double *state , double *flux , double *tend , double dt) { - int i,k,ll,s,inds,indf1,indf2,indt; - double r,u,w,t,p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; +void compute_tendencies_x(double* state_ptr, double* flux, double* tend, double dt) { + auto state = view_3d(state_ptr, NUM_VARS, (nz+2*hs), (nx+2*hs)); + + double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient hv_coef = -hv_beta * dx / (16*dt); ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// //Compute fluxes in the x-direction for each cell - for (k=0; k Date: Mon, 24 Mar 2025 21:25:45 +0200 Subject: [PATCH 49/83] Fully mdspan-ify state --- cpp-mdspan/miniWeather_mdspan.cpp | 129 ++++++++++++++---------------- 1 file changed, 62 insertions(+), 67 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 4d3d8e9..842aa67 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -127,14 +127,14 @@ void collision ( double x , double z , double &r , double &u , doub void hydro_const_theta ( double z , double &r , double &t ); void hydro_const_bvfreq ( double z , double bv_freq0 , double &r , double &t ); double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ); -void output ( double *state , double etime ); -void ncwrap ( int ierr , int line ); -void perform_timestep ( double *state , double *state_tmp , double *flux , double *tend , double dt ); -void semi_discrete_step ( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ); -void compute_tendencies_x ( double *state , double *flux , double *tend , double dt); -void 
compute_tendencies_z ( double *state , double *flux , double *tend , double dt); -void set_halo_values_x ( double *state ); -void set_halo_values_z ( double *state ); +void output (view_3d state, double etime); +void ncwrap (int ierr, int line); +void perform_timestep (view_3d state, view_3d state_tmp, double* flux, double* tend, double dt); +void semi_discrete_step (view_3d state_init, view_3d state_forcing, view_3d state_out, double dt, int dir, double* flux, double* tend); +void compute_tendencies_x (view_3d state, double* flux, double* tend, double dt); +void compute_tendencies_z (view_3d state, double* flux, double* tend, double dt); +void set_halo_values_x (view_3d state); +void set_halo_values_z (view_3d state); struct reduction_result { double mass; @@ -159,9 +159,11 @@ int main(int argc, char **argv) { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); } + auto state_view = view_3d(state.get(), NUM_VARS, (nz+2*hs), (nx+2*hs)); + auto state_tmp_view = view_3d(state_tmp.get(), NUM_VARS, (nz+2*hs), (nx+2*hs)); //Output the initial state - output(state.get(), etime); + output(state_view, etime); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP @@ -171,7 +173,7 @@ int main(int argc, char **argv) { //If the time step leads to exceeding the simulation time, shorten it for the last step if (etime + dt > sim_time) { dt = sim_time - etime; } //Perform a single time step - perform_timestep(state.get(), state_tmp.get(), flux.get(), tend.get(), dt); + perform_timestep(state_view, state_tmp_view, flux.get(), tend.get(), dt); //Inform the user #ifndef NO_INFORM if (mainproc) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime , sim_time ); } @@ -182,7 +184,7 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (output_counter >= output_freq) { output_counter = output_counter - output_freq; - output(state.get(), etime); + output(state_view, etime); } #if 0 { @@ -216,26 +218,25 @@ int main(int argc, char **argv) { // q* = q[n] + dt/3 * rhs(q[n]) // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) -void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) { - //fprintf(stderr, "direction_switch: %d\n", direction_switch); +void perform_timestep(view_3d state, view_3d state_tmp, double* flux, double* tend, double dt) { if (direction_switch) { //x-direction first - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend); + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, tend); //z-direction second - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, tend); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend); + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, tend); } else { //z-direction second - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_Z , flux , tend ); - 
semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , flux , tend ); - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_Z , flux , tend ); + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, tend); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend); + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, tend); //x-direction first - semi_discrete_step( state , state , state_tmp , dt / 3 , DIR_X , flux , tend ); - semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , flux , tend ); - semi_discrete_step( state , state_tmp , state , dt / 1 , DIR_X , flux , tend ); + semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend); + semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, tend); } if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } } @@ -244,10 +245,9 @@ void perform_timestep( double *state , double *state_tmp , double *flux , double //Perform a single semi-discretized step in time with the form: //state_out = state_init + dt * rhs(state_forcing) //Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out -void semi_discrete_step( double *state_init , double *state_forcing , double *state_out , double dt , int dir , double *flux , double *tend ) { - int inds, indt, indw; - double x, z, wpert, dist, x0, z0, xrad, zrad, amp; - if (dir == DIR_X) { +void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, double dt, int dir, double* flux, double* tend) { + int indt, indw; + if (dir == DIR_X) { //Set the halo values for this MPI task's fluid state in the x-direction set_halo_values_x(state_forcing); //Compute the time tendencies for the fluid state in the x-direction @@ -267,19 +267,21 @@ void semi_discrete_step( double *state_init , double *state_forcing , double *st for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { - x = (i_beg + i+0.5)*dx; - z = (k_beg + k+0.5)*dz; + double x = (i_beg + i+0.5)*dx; + double z = (k_beg + k+0.5)*dz; // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload // Neither of these are particularly well supported. So I'm manually inlining here // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. ); + + double wpert = 0.0; { - x0 = xlen/8; - z0 = 1000; - xrad = 500; - zrad = 500; - amp = 0.01; + double x0 = xlen/8; + double z0 = 1000; + double xrad = 500; + double zrad = 500; + double amp = 0.01; //Compute distance from bubble center - dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; //If the distance from bubble center is less than the radius, create a cos**2 profile if (dist <= pi / 2.) 
{ wpert = amp * pow(cos(dist),2.); @@ -290,9 +292,8 @@ void semi_discrete_step( double *state_init , double *state_forcing , double *st indw = ID_WMOM*nz*nx + k*nx + i; tend[indw] += wpert*hy_dens_cell[hs+k]; } - inds = ll*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs; indt = ll*nz*nx + k*nx + i; - state_out[inds] = state_init[inds] + dt * tend[indt]; + state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend[indt]; } } } @@ -303,8 +304,7 @@ void semi_discrete_step( double *state_init , double *state_forcing , double *st //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x(double* state_ptr, double* flux, double* tend, double dt) { - auto state = view_3d(state_ptr, NUM_VARS, (nz+2*hs), (nx+2*hs)); +void compute_tendencies_x(view_3d state, double* flux, double* tend, double dt) { double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient @@ -362,22 +362,22 @@ void compute_tendencies_x(double* state_ptr, double* flux, double* tend, double //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_z( double *state , double *flux , double *tend , double dt) { - int i,k,ll,s, inds, indf1, indf2, indt; - double r,u,w,t,p, stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; +void compute_tendencies_z(view_3d state, double* flux, double* tend, double dt) { + + int indf1, indf2, indt; + double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient hv_coef = -hv_beta * dz / (16*dt); ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// //Compute fluxes in the x-direction for each cell - for (k=0; k Date: Mon, 24 Mar 2025 22:56:34 +0200 Subject: [PATCH 50/83] Add make_unique_mdarray extents and IndexType... overloads --- cpp-mdspan/unique_mdarray.hpp | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/cpp-mdspan/unique_mdarray.hpp b/cpp-mdspan/unique_mdarray.hpp index eb45eae..6c34f8a 100644 --- a/cpp-mdspan/unique_mdarray.hpp +++ b/cpp-mdspan/unique_mdarray.hpp @@ -22,6 +22,7 @@ using MDSPAN_IMPL_STANDARD_NAMESPACE :: extents; using MDSPAN_IMPL_STANDARD_NAMESPACE :: layout_right; using MDSPAN_IMPL_STANDARD_NAMESPACE :: mdspan; using MDSPAN_IMPL_STANDARD_NAMESPACE :: default_accessor; +using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; namespace impl { @@ -525,7 +526,7 @@ class unique_mdarray { }; // -// The analog of make_unique needs a mapping instead of a size. +// Analog of make_unique: it takes a mapping instead of a size. // template constexpr unique_mdarray @@ -536,4 +537,29 @@ constexpr unique_mdarray{ptr.release(), num_elts}, mapping}; } +// +// Another analog of make_unique: it takes an extents object instead of a size. +// +template +constexpr unique_mdarray> + make_unique_mdarray(const extents& exts) +{ + return make_unique_mdarray(layout_right::template mapping{exts}); +} + +// +// Another make_unique(size_t) analog; it takes a list of extents +// (as things convertible to index_type, generally integers). +// +template +requires( + (std::is_convertible_v && ...) 
&& + (std::is_nothrow_constructible_v && ...) +) +constexpr unique_mdarray> + make_unique_mdarray(OtherIndexTypes... exts) +{ + return make_unique_mdarray(dims{exts...}); +} + } // namespace md From f89c27d50358521e752e556c644c21d3229d6862 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 24 Mar 2025 22:56:38 +0200 Subject: [PATCH 51/83] Finish mdspan-ifying state --- cpp-mdspan/miniWeather_mdspan.cpp | 82 ++++++++++++++----------------- 1 file changed, 37 insertions(+), 45 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 842aa67..ec19f3e 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -93,6 +93,13 @@ std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell i /////////////////////////////////////////////////////////////////////////////////////// double etime; //Elapsed model time double output_counter; //Helps determine when it's time to do output + +namespace md { + using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; +} // namespace md +using alloc_3d = md::unique_mdarray, md::layout_right>; +using view_3d = md::mdspan, md::layout_right>; + //Runtime variable arrays // // C indexing seems to prefer the extents in reverse order. @@ -102,20 +109,13 @@ double output_counter; //Helps determine when it's time to do output // // state extents: NUM_VARS, (nz+2*hs), (nx+2*hs) // -std::unique_ptr state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -std::unique_ptr state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +alloc_3d state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +alloc_3d state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) std::unique_ptr flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) std::unique_ptr tend; //Fluid state tendencies. 
Dimensions: (nx,nz,NUM_VARS) int num_out = 0; //The number of outputs performed so far int direction_switch = 1; -namespace md { - using MDSPAN_IMPL_STANDARD_NAMESPACE :: layout_left; - using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; -} // namespace md - -using view_3d = md::mdspan, md::layout_right>; - //Declaring the functions defined after "main" void init ( int *argc , char ***argv ); void finalize ( ); @@ -159,8 +159,8 @@ int main(int argc, char **argv) { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); } - auto state_view = view_3d(state.get(), NUM_VARS, (nz+2*hs), (nx+2*hs)); - auto state_tmp_view = view_3d(state_tmp.get(), NUM_VARS, (nz+2*hs), (nx+2*hs)); + auto state_view = view_3d(state.get(), NUM_VARS, nz+2*hs, nx+2*hs); + auto state_tmp_view = view_3d(state_tmp.get(), NUM_VARS, nz+2*hs, nx+2*hs); //Output the initial state output(state_view, etime); @@ -496,7 +496,6 @@ void set_halo_values_z(view_3d state) { void init( int *argc , char ***argv ) { - int i, k, ii, kk, ll, inds; double x, z, r, u, w, t, hr, ht; (void) MPI_Init(argc,argv); @@ -532,8 +531,11 @@ void init( int *argc , char ***argv ) { mainproc = (myrank == 0); //Allocate the model data - state = std::make_unique( (nx+2*hs)*(nz+2*hs)*NUM_VARS ); - state_tmp = std::make_unique( (nx+2*hs)*(nz+2*hs)*NUM_VARS ); + { + auto state_mapping = md::layout_right::template mapping>{md::dims<3>{NUM_VARS, nz+2*hs, nx+2*hs}}; + state = md::make_unique_mdarray(state_mapping); + state_tmp = md::make_unique_mdarray(state_mapping); + } flux = std::make_unique( (nx+1)*(nz+1)*NUM_VARS ); tend = std::make_unique( nx*nz*NUM_VARS ); hy_dens_cell = std::make_unique( (nz+2*hs) ); @@ -560,16 +562,15 @@ void init( int *argc , char ***argv ) { ////////////////////////////////////////////////////////////////////////// // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature ////////////////////////////////////////////////////////////////////////// - for (k=0; k Date: Mon, 24 Mar 2025 23:16:18 +0200 Subject: [PATCH 52/83] mdspan-ify flux & tend in compute_tendencies_{x,z} --- cpp-mdspan/miniWeather_mdspan.cpp | 37 ++++++++++++++----------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index ec19f3e..4e64dcb 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -304,7 +304,9 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x(view_3d state, double* flux, double* tend, double dt) { +void compute_tendencies_x(view_3d state, double* flux_ptr, double* tend_ptr, double dt) { + auto flux = view_3d(flux_ptr, NUM_VARS, nz+1, nx+1); + auto tend = view_3d(tend_ptr, NUM_VARS, nz, nx); double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient @@ -334,10 +336,10 @@ void compute_tendencies_x(view_3d state, double* flux, double* tend, double dt) double p = C0 * pow(r*t, gamm); //Compute the flux vector - flux[ID_DENS*(nz+1)*(nx+1) + k*(nx+1) + i] = r*u - hv_coef*d3_vals[ID_DENS]; - flux[ID_UMOM*(nz+1)*(nx+1) + k*(nx+1) + i] = r*u*u+p - hv_coef*d3_vals[ID_UMOM]; - flux[ID_WMOM*(nz+1)*(nx+1) + k*(nx+1) + i] = r*u*w - 
-      flux[ID_RHOT*(nz+1)*(nx+1) + k*(nx+1) + i] = r*u*t   - hv_coef*d3_vals[ID_RHOT];
+      flux(ID_DENS, k, i) = r*u     - hv_coef*d3_vals[ID_DENS];
+      flux(ID_UMOM, k, i) = r*u*u+p - hv_coef*d3_vals[ID_UMOM];
+      flux(ID_WMOM, k, i) = r*u*w   - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*u*t   - hv_coef*d3_vals[ID_RHOT];
     }
   }
@@ -348,10 +350,7 @@ void compute_tendencies_x(view_3d state, double* flux, double* tend, double dt)
   for (int ll = 0; ll < NUM_VARS; ++ll) {
     for (int k = 0; k < nz; ++k) {
       for (int i = 0; i < nx; ++i) {
-        int indt  = ll* nz   * nx    + k* nx    + i  ;
-        int indf1 = ll*(nz+1)*(nx+1) + k*(nx+1) + i  ;
-        int indf2 = ll*(nz+1)*(nx+1) + k*(nx+1) + i+1;
-        tend[indt] = -( flux[indf2] - flux[indf1] ) / dx;
+        tend(ll, k, i) = -( flux(ll, k, i+1) - flux(ll, k, i) ) / dx;
       }
     }
   }
@@ -362,9 +361,10 @@ void compute_tendencies_x(view_3d state, double* flux, double* tend, double dt)
 //Since the halos are set in a separate routine, this will not require MPI
 //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
 //Then, compute the tendencies using those fluxes
-void compute_tendencies_z(view_3d state, double* flux, double* tend, double dt) {
+void compute_tendencies_z(view_3d state, double* flux_ptr, double* tend_ptr, double dt) {
+  auto flux = view_3d(flux_ptr, NUM_VARS, nz+1, nx+1);
+  auto tend = view_3d(tend_ptr, NUM_VARS, nz, nx);
 
-  int indf1, indf2, indt;
   double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef;
   //Compute the hyperviscosity coefficient
   hv_coef = -hv_beta * dz / (16*dt);
@@ -398,10 +398,10 @@ void compute_tendencies_z(view_3d state, double* flux, double* tend, double dt)
       }
 
       //Compute the flux vector with hyperviscosity
-      flux[ID_DENS*(nz+1)*(nx+1) + k*(nx+1) + i] = r*w     - hv_coef*d3_vals[ID_DENS];
-      flux[ID_UMOM*(nz+1)*(nx+1) + k*(nx+1) + i] = r*w*u   - hv_coef*d3_vals[ID_UMOM];
-      flux[ID_WMOM*(nz+1)*(nx+1) + k*(nx+1) + i] = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
-      flux[ID_RHOT*(nz+1)*(nx+1) + k*(nx+1) + i] = r*w*t   - hv_coef*d3_vals[ID_RHOT];
+      flux(ID_DENS, k, i) = r*w     - hv_coef*d3_vals[ID_DENS];
+      flux(ID_UMOM, k, i) = r*w*u   - hv_coef*d3_vals[ID_UMOM];
+      flux(ID_WMOM, k, i) = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*w*t   - hv_coef*d3_vals[ID_RHOT];
     }
   }
@@ -412,12 +412,9 @@ void compute_tendencies_z(view_3d state, double* flux, double* tend, double dt)
   for (int ll = 0; ll < NUM_VARS; ++ll) {
     for (int k = 0; k < nz; ++k) {
       for (int i = 0; i < nx; ++i) {
-        indt  = ll* nz   * nx    + k* nx    + i;
-        indf1 = ll*(nz+1)*(nx+1) + (k  )*(nx+1) + i;
-        indf2 = ll*(nz+1)*(nx+1) + (k+1)*(nx+1) + i;
-        tend[indt] = -( flux[indf2] - flux[indf1] ) / dz;
+        tend(ll, k, i) = -( flux(ll, k+1, i) - flux(ll, k, i) ) / dz;
         if (ll == ID_WMOM) {
-          tend[indt] = tend[indt] - state(ID_DENS, k+hs, i+hs)*grav;
+          tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav;
         }
       }
     }

From 2c8c76687b8e07f173d32dcf49caa9ff34417db0 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Mon, 24 Mar 2025 23:41:11 +0200
Subject: [PATCH 53/83] Add make_unique_mdarray tests

---
 cpp-mdspan/test_unique_mdarray.cpp | 54 ++++++++++++++++++++++++++++++
 cpp-mdspan/unique_mdarray.hpp      |  6 +++-
 2 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/cpp-mdspan/test_unique_mdarray.cpp b/cpp-mdspan/test_unique_mdarray.cpp
index 0acefd7..ce5a4ab 100644
--- a/cpp-mdspan/test_unique_mdarray.cpp
+++ b/cpp-mdspan/test_unique_mdarray.cpp
@@ -249,10 +249,64 @@ void construction(const Deleter& d) {
   }
 }
 
+void make_unique_mdarray_with_mapping() {
+  {
+    using extents_type = md::dims<0>;
+    const extents_type exts{};
+    const auto mapping = md::layout_right::template mapping<extents_type>{exts};
+    auto x = md::make_unique_mdarray<double>(mapping);
+    using x_type = decltype(x);
+    static_assert(std::is_same_v<typename x_type::extents_type, extents_type>);
+    static_assert(std::is_same_v<typename x_type::element_type, double>);
+    static_assert(x.rank() == 0);
+    assert(x.mapping().extents() == exts);
+  }
+  {
+    using extents_type = md::dims<1>;
+    const extents_type exts{3};
+    const auto mapping = md::layout_right::template mapping<extents_type>{exts};
+    auto x = md::make_unique_mdarray<double>(mapping);
+    using x_type = decltype(x);
+    static_assert(std::is_same_v<typename x_type::extents_type, extents_type>);
+    static_assert(std::is_same_v<typename x_type::element_type, double>);
+    static_assert(x.rank() == 1);
+    assert(x.mapping().extents() == exts);
+    assert(x.extent(0) == 3);
+  }
+  {
+    using extents_type = md::dims<2>;
+    const extents_type exts{3, 5};
+    const auto mapping = md::layout_right::template mapping<extents_type>{exts};
+    auto x = md::make_unique_mdarray<double>(mapping);
+    using x_type = decltype(x);
+    static_assert(std::is_same_v<typename x_type::extents_type, extents_type>);
+    static_assert(std::is_same_v<typename x_type::element_type, double>);
+    static_assert(x.rank() == 2);
+    assert(x.mapping().extents() == exts);
+    assert(x.extent(0) == 3);
+    assert(x.extent(1) == 5);
+  }
+}
+
+void make_unique_mdarray_with_extents() {
+  const auto exts = md::dims<3>{3, 5, 7};
+  auto x = md::make_unique_mdarray<double>(exts);
+  using x_type = decltype(x);
+  static_assert(std::is_same_v<typename x_type::extents_type, md::dims<3>>);
+  static_assert(std::is_same_v<typename x_type::element_type, double>);
+  static_assert(x.rank() == 3);
+  assert(x.mapping().extents() == exts);
+  assert(x.extent(0) == 3);
+  assert(x.extent(1) == 5);
+  assert(x.extent(2) == 7);
+}
+
 } // namespace test
 
 int main() {
   test::construction(std::default_delete{});
   test::construction(test::my_array_deleter{});
+  test::make_unique_mdarray_with_mapping();
+  test::make_unique_mdarray_with_extents();
   return 0;
 }
diff --git a/cpp-mdspan/unique_mdarray.hpp b/cpp-mdspan/unique_mdarray.hpp
index 6c34f8a..1aa06ba 100644
--- a/cpp-mdspan/unique_mdarray.hpp
+++ b/cpp-mdspan/unique_mdarray.hpp
@@ -541,12 +541,15 @@ constexpr unique_mdarray: it takes an extents object instead of a size.
 //
 template<class T, class IndexType, std::size_t ... Extents>
-constexpr unique_mdarray<T, dims<sizeof...(Extents)>>
+constexpr unique_mdarray<T, dims<sizeof...(Extents)>, layout_right>
   make_unique_mdarray(const extents<IndexType, Extents...>& exts)
 {
+  using extents_type = extents<IndexType, Extents...>;
   return make_unique_mdarray<T>(layout_right::template mapping<extents_type>{exts});
 }
 
+// NEEDS A BIT MORE TESTING
+#if 0
 //
 // Another make_unique(size_t) analog; it takes a list of extents
 // (as things convertible to index_type, generally integers).
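As a usage note (not part of the patch; the element type and extents are
illustrative, and demo_make_unique_mdarray is a name invented here), the
extents-taking overload above composes with mdspan like this:

    #include <cassert>
    // Hypothetical caller, following the conventions of miniWeather_mdspan.cpp
    // (md::make_unique_mdarray, md::dims, md::mdspan as used in this repo).
    void demo_make_unique_mdarray() {
      auto a = md::make_unique_mdarray<double>(md::dims<2>{3, 5});
      assert(a.extent(0) == 3 && a.extent(1) == 5);
      // For element access, view the owned allocation through mdspan,
      // as miniWeather_mdspan.cpp does for `state`.
      auto a_view = md::mdspan<double, md::dims<2>, md::layout_right>(a.get(), 3, 5);
      a_view(2, 4) = 42.0;  // layout_right: the last extent is contiguous
    }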
@@ -561,5 +564,6 @@ constexpr unique_mdarray<T, dims<sizeof...(OtherIndexTypes)>>
 {
   return make_unique_mdarray<T>(dims<sizeof...(OtherIndexTypes)>{exts...});
 }
+#endif
 
 } // namespace md

From 4e478b4edf21b4d78e2c8fd8c1faae77772dd5c9 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Tue, 25 Mar 2025 00:19:08 +0200
Subject: [PATCH 54/83] Add make_unique_mdarray taking index_type extents

---
 cpp-mdspan/test_unique_mdarray.cpp | 37 ++++++++++++++++++++++++++++++
 cpp-mdspan/unique_mdarray.hpp      | 16 +++++++++----
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/cpp-mdspan/test_unique_mdarray.cpp b/cpp-mdspan/test_unique_mdarray.cpp
index ce5a4ab..0c91489 100644
--- a/cpp-mdspan/test_unique_mdarray.cpp
+++ b/cpp-mdspan/test_unique_mdarray.cpp
@@ -301,6 +301,42 @@ void make_unique_mdarray_with_extents() {
   assert(x.extent(2) == 7);
 }
 
+void make_unique_mdarray_with_dims() {
+  {
+    using extents_type = md::dims<0>;
+    const extents_type exts{};
+    auto x = md::make_unique_mdarray<double>();
+    using x_type = decltype(x);
+    static_assert(std::is_same_v<typename x_type::extents_type, extents_type>);
+    static_assert(std::is_same_v<typename x_type::element_type, double>);
+    static_assert(x.rank() == 0);
+    assert(x.mapping().extents() == exts);
+  }
+  {
+    using extents_type = md::dims<1>;
+    const extents_type exts{3};
+    auto x = md::make_unique_mdarray<double>(3);
+    using x_type = decltype(x);
+    static_assert(std::is_same_v<typename x_type::extents_type, extents_type>);
+    static_assert(std::is_same_v<typename x_type::element_type, double>);
+    static_assert(x.rank() == 1);
+    assert(x.mapping().extents() == exts);
+    assert(x.extent(0) == 3);
+  }
+  {
+    using extents_type = md::dims<2>;
+    const extents_type exts{3, 5};
+    auto x = md::make_unique_mdarray<double>(3, 5);
+    using x_type = decltype(x);
+    static_assert(std::is_same_v<typename x_type::extents_type, extents_type>);
+    static_assert(std::is_same_v<typename x_type::element_type, double>);
+    static_assert(x.rank() == 2);
+    assert(x.mapping().extents() == exts);
+    assert(x.extent(0) == 3);
+    assert(x.extent(1) == 5);
+  }
+}
+
 } // namespace test
 
 int main() {
@@ -308,5 +344,6 @@ int main() {
   test::construction(test::my_array_deleter{});
   test::make_unique_mdarray_with_mapping();
   test::make_unique_mdarray_with_extents();
+  test::make_unique_mdarray_with_dims();
   return 0;
 }
diff --git a/cpp-mdspan/unique_mdarray.hpp b/cpp-mdspan/unique_mdarray.hpp
index 1aa06ba..1487527 100644
--- a/cpp-mdspan/unique_mdarray.hpp
+++ b/cpp-mdspan/unique_mdarray.hpp
@@ -548,14 +548,23 @@ constexpr unique_mdarray<T, dims<sizeof...(Extents)>, layout_right>
   return make_unique_mdarray<T>(layout_right::template mapping<extents_type>{exts});
 }
 
-// NEEDS A BIT MORE TESTING
-#if 0
+//
+// Special case for rank-0 array (with a single element).
+//
+template<class T>
+constexpr unique_mdarray<T, dims<0>>
+  make_unique_mdarray()
+{
+  return make_unique_mdarray<T>(dims<0>{});
+}
+
 //
 // Another make_unique(size_t) analog; it takes a list of extents
 // (as things convertible to index_type, generally integers).
 //
-template<class T, class ... OtherIndexTypes>
+template<class T, class ... OtherIndexTypes> requires(
+  (sizeof...(OtherIndexTypes) != 0) &&
   (std::is_convertible_v<OtherIndexTypes, std::size_t> && ...) &&
   (std::is_nothrow_constructible_v<std::size_t, OtherIndexTypes> && ...)
) @@ -564,6 +573,5 @@ constexpr unique_mdarray> { return make_unique_mdarray(dims{exts...}); } -#endif } // namespace md From 9541d310776a7bb0753530d30968fb2ef778a86d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 25 Mar 2025 00:19:27 +0200 Subject: [PATCH 55/83] mdspan-ify flux and tend --- cpp-mdspan/miniWeather_mdspan.cpp | 85 +++++++++++++++---------------- 1 file changed, 40 insertions(+), 45 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 4e64dcb..2e9300c 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -109,11 +109,11 @@ using view_3d = md::mdspan, md::layout_right>; // // state extents: NUM_VARS, (nz+2*hs), (nx+2*hs) // -alloc_3d state; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -alloc_3d state_tmp; //Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -std::unique_ptr flux; //Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) -std::unique_ptr tend; //Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) -int num_out = 0; //The number of outputs performed so far +alloc_3d state; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +alloc_3d state_tmp; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) +alloc_3d flux; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) +alloc_3d tend; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +int num_out = 0; // The number of outputs performed so far int direction_switch = 1; //Declaring the functions defined after "main" @@ -129,10 +129,11 @@ void hydro_const_bvfreq ( double z , double bv_freq0 , double &r , double &t double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ); void output (view_3d state, double etime); void ncwrap (int ierr, int line); -void perform_timestep (view_3d state, view_3d state_tmp, double* flux, double* tend, double dt); -void semi_discrete_step (view_3d state_init, view_3d state_forcing, view_3d state_out, double dt, int dir, double* flux, double* tend); -void compute_tendencies_x (view_3d state, double* flux, double* tend, double dt); -void compute_tendencies_z (view_3d state, double* flux, double* tend, double dt); +void perform_timestep (view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, double dt); +void semi_discrete_step (view_3d state_init, view_3d state_forcing, view_3d state_out, + double dt, int dir, view_3d flux, view_3d tend); +void compute_tendencies_x (view_3d state, view_3d flux, view_3d tend, double dt); +void compute_tendencies_z (view_3d state, view_3d flux, view_3d tend, double dt); void set_halo_values_x (view_3d state); void set_halo_values_z (view_3d state); @@ -161,6 +162,8 @@ int main(int argc, char **argv) { } auto state_view = view_3d(state.get(), NUM_VARS, nz+2*hs, nx+2*hs); auto state_tmp_view = view_3d(state_tmp.get(), NUM_VARS, nz+2*hs, nx+2*hs); + auto flux_view = view_3d(flux.get(), NUM_VARS, nz+1, nx+1); + auto tend_view = view_3d(tend.get(), NUM_VARS, nz, nx); //Output the initial state output(state_view, etime); @@ -171,12 +174,16 @@ int main(int argc, char **argv) { auto t1 = std::chrono::steady_clock::now(); while (etime < sim_time) { //If the time step leads to exceeding the simulation time, shorten it for the last step - if (etime + dt > sim_time) { dt = sim_time - etime; } + if (etime + dt > sim_time) { + dt = sim_time - etime; + } //Perform a single time step - perform_timestep(state_view, state_tmp_view, flux.get(), tend.get(), dt); + 
perform_timestep(state_view, state_tmp_view, flux_view, tend_view, dt); //Inform the user -#ifndef NO_INFORM - if (mainproc) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime , sim_time ); } +#if ! defined(NO_INFORM) + if (mainproc) { + fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime, sim_time); + } #endif //Update the elapsed time and output counter etime = etime + dt; @@ -186,13 +193,6 @@ int main(int argc, char **argv) { output_counter = output_counter - output_freq; output(state_view, etime); } -#if 0 - { - auto [mass, te] = reductions(); - fprintf(stderr, "mass: %le\n" , mass ); - fprintf(stderr, "te: %le\n" , te ); - } -#endif // 0 } auto t2 = std::chrono::steady_clock::now(); if (mainproc) { @@ -218,7 +218,8 @@ int main(int argc, char **argv) { // q* = q[n] + dt/3 * rhs(q[n]) // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) -void perform_timestep(view_3d state, view_3d state_tmp, double* flux, double* tend, double dt) { +void perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, double dt) +{ if (direction_switch) { //x-direction first semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend); @@ -244,9 +245,11 @@ void perform_timestep(view_3d state, view_3d state_tmp, double* flux, double* te //Perform a single semi-discretized step in time with the form: //state_out = state_init + dt * rhs(state_forcing) -//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out -void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, double dt, int dir, double* flux, double* tend) { - int indt, indw; +//Meaning the step starts from state_init, computes the rhs using state_forcing, +//and stores the result in state_out +void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, + double dt, int dir, view_3d flux, view_3d tend) +{ if (dir == DIR_X) { //Set the halo values for this MPI task's fluid state in the x-direction set_halo_values_x(state_forcing); @@ -289,11 +292,9 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state wpert = 0.; } } - indw = ID_WMOM*nz*nx + k*nx + i; - tend[indw] += wpert*hy_dens_cell[hs+k]; + tend(ID_WMOM, k, i) += wpert*hy_dens_cell[hs+k]; } - indt = ll*nz*nx + k*nx + i; - state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend[indt]; + state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend(ll, k, i); } } } @@ -304,10 +305,7 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x(view_3d state, double* flux_ptr, double* tend_ptr, double dt) { - auto flux = view_3d(flux_ptr, NUM_VARS, nz+1, nx+1); - auto tend = view_3d(tend_ptr, NUM_VARS, nz, nx); - +void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt) { double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient hv_coef = -hv_beta * dx / (16*dt); @@ -361,10 +359,7 @@ void compute_tendencies_x(view_3d state, double* flux_ptr, double* tend_ptr, dou //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the 
tendencies using those fluxes -void compute_tendencies_z(view_3d state, double* flux_ptr, double* tend_ptr, double dt) { - auto flux = view_3d(flux_ptr, NUM_VARS, nz+1, nx+1); - auto tend = view_3d(tend_ptr, NUM_VARS, nz, nx); - +void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt) { double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient hv_coef = -hv_beta * dz / (16*dt); @@ -533,19 +528,19 @@ void init( int *argc , char ***argv ) { state = md::make_unique_mdarray(state_mapping); state_tmp = md::make_unique_mdarray(state_mapping); } - flux = std::make_unique( (nx+1)*(nz+1)*NUM_VARS ); - tend = std::make_unique( nx*nz*NUM_VARS ); - hy_dens_cell = std::make_unique( (nz+2*hs) ); - hy_dens_theta_cell = std::make_unique( (nz+2*hs) ); - hy_dens_int = std::make_unique( (nz+1) ); - hy_dens_theta_int = std::make_unique( (nz+1) ); - hy_pressure_int = std::make_unique( (nz+1) ); + flux = md::make_unique_mdarray(md::dims<3>{NUM_VARS, nz+1, nx+1}); + tend = md::make_unique_mdarray(md::dims<3>{NUM_VARS, nz, nx}); + hy_dens_cell = std::make_unique(nz+2*hs); + hy_dens_theta_cell = std::make_unique(nz+2*hs); + hy_dens_int = std::make_unique(nz+1); + hy_dens_theta_int = std::make_unique(nz+1); + hy_pressure_int = std::make_unique(nz+1); //Define the maximum stable time step based on an assumed maximum wind speed dt = fmin(dx,dz) / max_speed * cfl; //Set initial elapsed model time and output_counter to zero - etime = 0.; - output_counter = 0.; + etime = 0.0; + output_counter = 0.0; //If I'm the main process in MPI, display some grid information if (mainproc) { From ca60a5cd49fe040726c545e93f748820ab9d42a1 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 25 Mar 2025 00:19:45 +0200 Subject: [PATCH 56/83] Make Python miniWeather comparable to C --- python/miniWeather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/miniWeather.py b/python/miniWeather.py index bcb2e13..ac285a8 100644 --- a/python/miniWeather.py +++ b/python/miniWeather.py @@ -75,7 +75,7 @@ # /////////////////////////////////////////////////////////////////////////////////////// # The x-direction length is twice as long as the z-direction length # So, you'll want to have nx_glob be twice as large as nz_glob -nz_glob: int = 100 # Number of total cells in the z-direction +nz_glob: int = 50 # Number of total cells in the z-direction nx_glob: int = 2 * nz_glob # Number of total cells in the x-direction sim_time: real = 1000.0 # How many seconds to run the simulation output_freq: real = 10.0 # How frequently to output data to file (in seconds) From 160dae63ab56996a23765328f3a8e9789ef43751 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 25 Mar 2025 01:07:35 +0200 Subject: [PATCH 57/83] More C++-ification and reformatting --- cpp-mdspan/miniWeather_mdspan.cpp | 253 +++++++++++++++--------------- 1 file changed, 124 insertions(+), 129 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 2e9300c..ef7e2f8 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -117,25 +117,54 @@ int num_out = 0; // The number of outputs performed so far int direction_switch = 1; //Declaring the functions defined after "main" -void init ( int *argc , char ***argv ); -void finalize ( ); -void injection ( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ); -void density_current ( double x , double z , double &r , double &u , double &w , 
double &t , double &hr , double &ht ); -void gravity_waves ( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ); -void thermal ( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ); -void collision ( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ); -void hydro_const_theta ( double z , double &r , double &t ); -void hydro_const_bvfreq ( double z , double bv_freq0 , double &r , double &t ); -double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ); -void output (view_3d state, double etime); -void ncwrap (int ierr, int line); -void perform_timestep (view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, double dt); -void semi_discrete_step (view_3d state_init, view_3d state_forcing, view_3d state_out, - double dt, int dir, view_3d flux, view_3d tend); -void compute_tendencies_x (view_3d state, view_3d flux, view_3d tend, double dt); -void compute_tendencies_z (view_3d state, view_3d flux, view_3d tend, double dt); -void set_halo_values_x (view_3d state); -void set_halo_values_z (view_3d state); +void init(int *argc , char ***argv ); +void finalize(); + +struct test_case { + double r; + double u; + double w; + double t; + double hr; + double ht; +}; + +test_case injection(double x, double z); +test_case density_current(double x, double z); +test_case gravity_waves(double x, double z); +test_case thermal(double x, double z); +test_case collision(double x, double z); + +test_case get_test_case(int data_spec, double x_, double z_) { + if (data_spec == DATA_SPEC_COLLISION ) { return collision(x_, z_); } + if (data_spec == DATA_SPEC_THERMAL ) { return thermal(x_, z_); } + if (data_spec == DATA_SPEC_GRAVITY_WAVES ) { return gravity_waves(x_, z_); } + if (data_spec == DATA_SPEC_DENSITY_CURRENT) { return density_current(x_, z_); } + if (data_spec == DATA_SPEC_INJECTION ) { return injection(x_, z_); } + assert(false); + return test_case{}; +} + +struct r_t_pair { + double r; + double t; +}; + +r_t_pair hydro_const_theta(double z); +r_t_pair hydro_const_bvfreq(double z, double bv_freq0); +double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, + double xrad, double zrad); + +void output(view_3d state, double etime); +void ncwrap(int ierr, int line); +void perform_timestep(view_3d state, view_3d state_tmp, + view_3d flux, view_3d tend, double dt); +void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, + double dt, int dir, view_3d flux, view_3d tend); +void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt); +void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt); +void set_halo_values_x(view_3d state); +void set_halo_values_z(view_3d state); struct reduction_result { double mass; @@ -270,28 +299,9 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { - double x = (i_beg + i+0.5)*dx; - double z = (k_beg + k+0.5)*dz; - // Using sample_ellipse_cosine requires "acc routine" in OpenACC and "declare target" in OpenMP offload - // Neither of these are particularly well supported. So I'm manually inlining here - // wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. 
); - - double wpert = 0.0; - { - double x0 = xlen/8; - double z0 = 1000; - double xrad = 500; - double zrad = 500; - double amp = 0.01; - //Compute distance from bubble center - double dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; - //If the distance from bubble center is less than the radius, create a cos**2 profile - if (dist <= pi / 2.) { - wpert = amp * pow(cos(dist),2.); - } else { - wpert = 0.; - } - } + const double x = (i_beg + i+0.5)*dx; + const double z = (k_beg + k+0.5)*dz; + const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); tend(ID_WMOM, k, i) += wpert*hy_dens_cell[hs+k]; } state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend(ll, k, i); @@ -360,9 +370,9 @@ void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt) //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt) { - double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; + double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS]; //Compute the hyperviscosity coefficient - hv_coef = -hv_beta * dz / (16*dt); + const double hv_coef = -hv_beta * dz / (16*dt); ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// @@ -488,8 +498,6 @@ void set_halo_values_z(view_3d state) { void init( int *argc , char ***argv ) { - double x, z, r, u, w, t, hr, ht; - (void) MPI_Init(argc,argv); ///////////////////////////////////////////////////////////// @@ -523,13 +531,10 @@ void init( int *argc , char ***argv ) { mainproc = (myrank == 0); //Allocate the model data - { - auto state_mapping = md::layout_right::template mapping>{md::dims<3>{NUM_VARS, nz+2*hs, nx+2*hs}}; - state = md::make_unique_mdarray(state_mapping); - state_tmp = md::make_unique_mdarray(state_mapping); - } - flux = md::make_unique_mdarray(md::dims<3>{NUM_VARS, nz+1, nx+1}); - tend = md::make_unique_mdarray(md::dims<3>{NUM_VARS, nz, nx}); + state = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); + state_tmp = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); + flux = md::make_unique_mdarray(NUM_VARS, nz+1, nx+1); + tend = md::make_unique_mdarray(NUM_VARS, nz, nx); hy_dens_cell = std::make_unique(nz+2*hs); hy_dens_theta_cell = std::make_unique(nz+2*hs); hy_dens_int = std::make_unique(nz+1); @@ -564,15 +569,11 @@ void init( int *argc , char ***argv ) { for (int kk = 0; kk < nqpoints; ++kk) { for (int ii = 0; ii < nqpoints; ++ii) { //Compute the x,z location within the global domain based on cell and quadrature index - x = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx; - z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz; + const double x = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx; + const double z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz; //Set the fluid state based on the user's specification - if (data_spec_int == DATA_SPEC_COLLISION ) { collision (x,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_THERMAL ) { thermal (x,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_GRAVITY_WAVES ) { gravity_waves (x,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(x,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_INJECTION ) { injection (x,z,r,u,w,t,hr,ht); } + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, x, z); //Store into the fluid state array state(ID_DENS, k, 
i) = state(ID_DENS, k, i) + r * qweights[ii]*qweights[kk]; @@ -591,28 +592,20 @@ void init( int *argc , char ***argv ) { hy_dens_cell [k] = 0.; hy_dens_theta_cell[k] = 0.; for (int kk = 0; kk < nqpoints; ++kk) { - z = (k_beg + k-hs+0.5)*dz; + const double z = (k_beg + k-hs+0.5)*dz; //Set the fluid state based on the user's specification - if (data_spec_int == DATA_SPEC_COLLISION ) { collision (0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_THERMAL ) { thermal (0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_GRAVITY_WAVES ) { gravity_waves (0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_INJECTION ) { injection (0.,z,r,u,w,t,hr,ht); } + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); hy_dens_cell [k] = hy_dens_cell [k] + hr * qweights[kk]; hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk]; } } //Compute the hydrostatic background state at vertical cell interfaces for (int k = 0; k < nz+1; ++k) { - z = (k_beg + k)*dz; - if (data_spec_int == DATA_SPEC_COLLISION ) { collision (0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_THERMAL ) { thermal (0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_GRAVITY_WAVES ) { gravity_waves (0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(0.,z,r,u,w,t,hr,ht); } - if (data_spec_int == DATA_SPEC_INJECTION ) { injection (0.,z,r,u,w,t,hr,ht); } + const double z = (k_beg + k)*dz; + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); hy_dens_int [k] = hr; - hy_dens_theta_int[k] = hr*ht; - hy_pressure_int [k] = C0*pow((hr*ht),gamm); + hy_dens_theta_int[k] = hr * ht; + hy_pressure_int [k] = C0 * pow(hr * ht, gamm); } } @@ -621,12 +614,13 @@ void init( int *argc , char ***argv ) { //x and z are input coordinates at which to sample //r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location //hr and ht are output background hydrostatic density and potential temperature at that location -void injection( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ) { - hydro_const_theta(z,hr,ht); - r = 0.; - t = 0.; - u = 0.; - w = 0.; +test_case injection(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = 0.0; + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; } @@ -634,25 +628,26 @@ void injection( double x , double z , double &r , double &u , double &w , double //x and z are input coordinates at which to sample //r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location //hr and ht are output background hydrostatic density and potential temperature at that location -void density_current( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ) { - hydro_const_theta(z,hr,ht); - r = 0.; - t = 0.; - u = 0.; - w = 0.; - t = t + sample_ellipse_cosine(x,z,-20. 
,xlen/2,5000.,4000.,2000.); +test_case density_current(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0); + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; } //x and z are input coordinates at which to sample //r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location //hr and ht are output background hydrostatic density and potential temperature at that location -void gravity_waves( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ) { - hydro_const_bvfreq(z,0.02,hr,ht); - r = 0.; - t = 0.; - u = 15.; - w = 0.; +test_case gravity_waves(double x, double z) { + auto [hr, ht] = hydro_const_bvfreq(z, 0.02); + double r = 0.0; + double t = 0.0; + double u = 15.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; } @@ -660,13 +655,13 @@ void gravity_waves( double x , double z , double &r , double &u , double &w , do //x and z are input coordinates at which to sample //r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location //hr and ht are output background hydrostatic density and potential temperature at that location -void thermal( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ) { - hydro_const_theta(z,hr,ht); - r = 0.; - t = 0.; - u = 0.; - w = 0.; - t = t + sample_ellipse_cosine(x,z, 3. ,xlen/2,2000.,2000.,2000.); +test_case thermal(double x, double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = sample_ellipse_cosine(x, z, 3.0, xlen/2,2000.0, 2000.0, 2000.0); + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; } @@ -674,30 +669,33 @@ void thermal( double x , double z , double &r , double &u , double &w , double & //x and z are input coordinates at which to sample //r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location //hr and ht are output background hydrostatic density and potential temperature at that location -void collision( double x , double z , double &r , double &u , double &w , double &t , double &hr , double &ht ) { - hydro_const_theta(z,hr,ht); - r = 0.; - t = 0.; - u = 0.; - w = 0.; - t = t + sample_ellipse_cosine(x,z, 20.,xlen/2,2000.,2000.,2000.); - t = t + sample_ellipse_cosine(x,z,-20.,xlen/2,8000.,2000.,2000.); +test_case collision(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = 0.0; + double u = 0.0; + double w = 0.0; + t = t + sample_ellipse_cosine(x, z, 20.0, xlen/2,2000.0, 2000.0, 2000.0); + t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2,8000.0, 2000.0, 2000.0); + return {r, u, w, t, hr, ht}; } //Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere) //z is the input coordinate //r and t are the output background hydrostatic density and potential temperature -void hydro_const_theta( double z , double &r , double &t ) { +r_t_pair hydro_const_theta(double z) { const double theta0 = 300.; //Background potential temperature const double exner0 = 1.; //Surface-level Exner pressure double p,exner,rt; //Establish hydrostatic balance first using Exner pressure - t = theta0; //Potential Temperature at z - exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z - p = p0 * pow(exner,(cp/rd)); //Pressure at z - rt = pow((p / C0),(1. 
/ gamm)); //rho*theta at z - r = rt / t; //Density at z + double t = theta0; //Potential Temperature at z + exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z + p = p0 * pow(exner,(cp/rd)); //Pressure at z + rt = pow((p / C0),(1. / gamm)); //rho*theta at z + double r = rt / t; //Density at z + + return {r, t}; } @@ -705,15 +703,17 @@ void hydro_const_theta( double z , double &r , double &t ) { //z is the input coordinate //bv_freq0 is the constant Brunt-Vaisala frequency //r and t are the output background hydrostatic density and potential temperature -void hydro_const_bvfreq( double z , double bv_freq0 , double &r , double &t ) { +r_t_pair hydro_const_bvfreq(double z, double bv_freq0) { const double theta0 = 300.; //Background potential temperature const double exner0 = 1.; //Surface-level Exner pressure double p, exner, rt; - t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ); //Pot temp at z + double t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ); //Pot temp at z exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z p = p0 * pow(exner,(cp/rd)); //Pressure at z - rt = pow((p / C0),(1. / gamm)); //rho*theta at z - r = rt / t; //Density at z + rt = pow((p / C0), (1. / gamm)); //rho*theta at z + double r = rt / t; //Density at z + + return {r, t}; } @@ -723,10 +723,10 @@ void hydro_const_bvfreq( double z , double bv_freq0 , double &r , double &t ) { double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ) { double dist; //Compute distance from bubble center - dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.; + dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0; //If the distance from bubble center is less than the radius, create a cos**2 profile - if (dist <= pi / 2.) { - return amp * pow(cos(dist),2.); + if (dist <= pi / 2.0) { + return amp * pow(cos(dist), 2.0); } else { return 0.; } @@ -748,17 +748,12 @@ void output(view_3d state, double etime) { if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); } //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta). - // - // As with state, we retain the reversed order of extents. - // Some compilers aren't so good at CTAD for mapping. - auto mapping_2d = md::layout_right::template mapping>{md::dims<2>{nz, nx}}; - #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - auto dens = md::make_unique_mdarray(mapping_2d); - auto uwnd = md::make_unique_mdarray(mapping_2d); - auto wwnd = md::make_unique_mdarray(mapping_2d); + auto dens = md::make_unique_mdarray(nz, nx); + auto uwnd = md::make_unique_mdarray(nz, nx); + auto wwnd = md::make_unique_mdarray(nz, nx); #endif - auto theta = md::make_unique_mdarray(mapping_2d); + auto theta = md::make_unique_mdarray(nz, nx); auto etimearr = std::make_unique(1); // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. From 3e911e50f1367f380ff38d24d504822f79023804 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 25 Mar 2025 22:01:40 +0200 Subject: [PATCH 58/83] Remove most global state The result is gross, because it involves two "god structs." A better solution would be to separate out the const arrays from the nonconst arrays. An even better solution would be a state object that owns all the allocations, and hands out const or nonconst views as needed. 
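A sketch of that last idea, for future reference (hypothetical, not
implemented in this patch; model_state and const_view_3d are names invented
here, while view_3d, alloc_3d, md::make_unique_mdarray, NUM_VARS, and hs are
the ones this repo already defines):

    using const_view_3d = md::mdspan<const double, md::dims<3>, md::layout_right>;

    // Owns the allocation; hands out const or nonconst views on request.
    class model_state {
    public:
      model_state(int nx, int nz)
        : nx_(nx), nz_(nz),
          state_(md::make_unique_mdarray<double>(NUM_VARS, nz + 2*hs, nx + 2*hs))
      {}
      // Nonconst view, for the update kernels.
      view_3d state_view() {
        return view_3d(state_.get(), NUM_VARS, nz_ + 2*hs, nx_ + 2*hs);
      }
      // Const view, for output and reductions.
      const_view_3d state_view() const {
        return const_view_3d(state_.get(), NUM_VARS, nz_ + 2*hs, nx_ + 2*hs);
      }
    private:
      int nx_ = 0;
      int nz_ = 0;
      alloc_3d state_;
    };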
--- cpp-mdspan/miniWeather_mdspan.cpp | 367 ++++++++++++++++++++---------- 1 file changed, 241 insertions(+), 126 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index ef7e2f8..3397a02 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -44,8 +44,9 @@ constexpr int ID_DENS = 0; //index for density ("rho") constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") -constexpr int DIR_X = 1; //Integer constant to express that this operation is in the x-direction -constexpr int DIR_Z = 2; //Integer constant to express that this operation is in the z-direction + +enum class direction { X, Z }; + constexpr int DATA_SPEC_COLLISION = 1; constexpr int DATA_SPEC_THERMAL = 2; constexpr int DATA_SPEC_GRAVITY_WAVES = 3; @@ -76,17 +77,6 @@ double constexpr dz = zlen / nz_glob; // grid spacing in the x-direct /////////////////////////////////////////////////////////////////////////////////////// // Variables that are initialized but remain static over the course of the simulation /////////////////////////////////////////////////////////////////////////////////////// -double dt; //Model time step (seconds) -int nx, nz; //Number of local grid cells in the x- and z- dimensions for this MPI task -int i_beg, k_beg; //beginning index in the x- and z-directions for this MPI task -int nranks, myrank; //Number of MPI ranks and my rank id -int left_rank, right_rank; //MPI Rank IDs that exist to my left and right in the global domain -int mainproc; //Am I the main process (rank == 0)? -std::unique_ptr hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) -std::unique_ptr hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) -std::unique_ptr hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) -std::unique_ptr hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) -std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) /////////////////////////////////////////////////////////////////////////////////////// // Variables that are dynamics over the course of the simulation @@ -108,16 +98,46 @@ using view_3d = md::mdspan, md::layout_right>; // to be for C code to use row-major storage, but with Fortran ordering. // // state extents: NUM_VARS, (nz+2*hs), (nx+2*hs) -// -alloc_3d state; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -alloc_3d state_tmp; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) -alloc_3d flux; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) -alloc_3d tend; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) + +struct global_scalars { + // Model time step (seconds). The last time step might shorten this. + double dt; + + // Variables and arrays that are set once in init and remain read-only throughout the simulation. 
+ + int nx = nx_glob; + int nz = nz_glob; //Number of local grid cells in the x- and z- dimensions for this MPI task + int i_beg = 0; + int k_beg = 0; //beginning index in the x- and z-directions for this MPI task + + int nranks = 1; + int myrank = 0; //Number of MPI ranks and my rank id + int left_rank = 0; + int right_rank = 0; //MPI Rank IDs that exist to my left and right in the global domain + + bool mainproc() const { return myrank == 0; } //Am I the main process (rank == 0)? +}; + +struct global_arrays { + std::unique_ptr hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + std::unique_ptr hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + std::unique_ptr hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + std::unique_ptr hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + + // Arrays that are allocated in init and updated throughout the simulation. + + alloc_3d state; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d state_tmp; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d flux; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) + alloc_3d tend; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +}; + int num_out = 0; // The number of outputs performed so far int direction_switch = 1; //Declaring the functions defined after "main" -void init(int *argc , char ***argv ); +std::tuple init(int *argc , char ***argv ); void finalize(); struct test_case { @@ -155,22 +175,31 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); -void output(view_3d state, double etime); +void output(const global_scalars& scalars, const global_arrays& arrays, double etime); void ncwrap(int ierr, int line); void perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, double dt); + view_3d flux, view_3d tend, + const global_scalars& scalars, + const global_arrays& arrays); void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, - double dt, int dir, view_3d flux, view_3d tend); -void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt); -void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt); -void set_halo_values_x(view_3d state); -void set_halo_values_z(view_3d state); + double dt /* not scalars.dt */, + direction dir, view_3d flux, view_3d tend, + const global_scalars& scalars, + const global_arrays& arrays); +void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, + double dt, int nx, int nz, const global_arrays& arrays); +void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, + double dt, int nx, int nz, const global_arrays& arrays); +void set_halo_values_x(view_3d state, + const global_scalars& scalars, const global_arrays& arrays); +void set_halo_values_z(view_3d state, + const global_scalars& scalars, const global_arrays& arrays); struct reduction_result { double mass; double te; }; -reduction_result reductions(); +reduction_result reductions(const global_scalars& scalars, const global_arrays& arrays); /////////////////////////////////////////////////////////////////////////////////////// @@ -178,24 +207,28 @@ reduction_result reductions(); 
/////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - init( &argc , &argv ); + auto [scalars, arrays] = init( &argc , &argv ); //Initial reductions for mass, kinetic energy, and total energy. // // mass0: initial domain total for mass // te0: initial domain total for total energy - auto [mass0, te0] = reductions(); + auto [mass0, te0] = reductions(scalars, arrays); { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); } - auto state_view = view_3d(state.get(), NUM_VARS, nz+2*hs, nx+2*hs); - auto state_tmp_view = view_3d(state_tmp.get(), NUM_VARS, nz+2*hs, nx+2*hs); - auto flux_view = view_3d(flux.get(), NUM_VARS, nz+1, nx+1); - auto tend_view = view_3d(tend.get(), NUM_VARS, nz, nx); + auto state_view = view_3d(arrays.state.get(), + NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); + auto state_tmp_view = view_3d(arrays.state_tmp.get(), + NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); + auto flux_view = view_3d(arrays.flux.get(), + NUM_VARS, scalars.nz + 1, scalars.nx + 1); + auto tend_view = view_3d(arrays.tend.get(), + NUM_VARS, scalars.nz, scalars.nx); //Output the initial state - output(state_view, etime); + output(scalars, arrays, etime); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP @@ -203,35 +236,35 @@ int main(int argc, char **argv) { auto t1 = std::chrono::steady_clock::now(); while (etime < sim_time) { //If the time step leads to exceeding the simulation time, shorten it for the last step - if (etime + dt > sim_time) { - dt = sim_time - etime; + if (etime + scalars.dt > sim_time) { + scalars.dt = sim_time - etime; } //Perform a single time step - perform_timestep(state_view, state_tmp_view, flux_view, tend_view, dt); + perform_timestep(state_view, state_tmp_view, flux_view, tend_view, scalars, arrays); //Inform the user #if ! 
defined(NO_INFORM) - if (mainproc) { + if (scalars.mainproc()) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime, sim_time); } #endif //Update the elapsed time and output counter - etime = etime + dt; - output_counter = output_counter + dt; + etime = etime + scalars.dt; + output_counter = output_counter + scalars.dt; //If it's time for output, reset the counter, and do output if (output_counter >= output_freq) { output_counter = output_counter - output_freq; - output(state_view, etime); + output(scalars, arrays, etime); } } auto t2 = std::chrono::steady_clock::now(); - if (mainproc) { + if (scalars.mainproc()) { std::cerr << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; } //Final reductions for mass, kinetic energy, and total energy - auto [mass, te] = reductions(); + auto [mass, te] = reductions(scalars, arrays); - if (mainproc) { + if (scalars.mainproc()) { fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); fprintf(stderr, "d_te: %le\n" , (te - te0 )/te0 ); } @@ -247,26 +280,30 @@ int main(int argc, char **argv) { // q* = q[n] + dt/3 * rhs(q[n]) // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) -void perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, double dt) +void perform_timestep(view_3d state, view_3d state_tmp, + view_3d flux, view_3d tend, + const global_scalars& scalars, + const global_arrays& arrays) { + const double dt = scalars.dt; if (direction_switch) { //x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend); - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, tend); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, scalars, arrays); //z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, tend); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend); - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, tend); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, scalars, arrays); } else { //z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_Z, flux, tend); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend); - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_Z, flux, tend); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, scalars, arrays); //x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, DIR_X, flux, tend); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend); - semi_discrete_step(state, state_tmp, state , dt / 1, DIR_X, flux, tend); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, scalars, arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, 
scalars, arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, scalars, arrays); } if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } } @@ -277,24 +314,32 @@ void perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d te //Meaning the step starts from state_init, computes the rhs using state_forcing, //and stores the result in state_out void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, - double dt, int dir, view_3d flux, view_3d tend) + double dt /* not scalars.dt */, + direction dir, view_3d flux, view_3d tend, + const global_scalars& scalars, + const global_arrays& arrays) { - if (dir == DIR_X) { + const int nx = scalars.nx; + const int nz = scalars.nz; + + if (dir == direction::X) { //Set the halo values for this MPI task's fluid state in the x-direction - set_halo_values_x(state_forcing); + set_halo_values_x(state_forcing, scalars, arrays); //Compute the time tendencies for the fluid state in the x-direction - compute_tendencies_x(state_forcing, flux, tend, dt); - } else if (dir == DIR_Z) { + compute_tendencies_x(state_forcing, flux, tend, dt, nx, nz, arrays); + } else if (dir == direction::Z) { //Set the halo values for this MPI task's fluid state in the z-direction - set_halo_values_z(state_forcing); + set_halo_values_z(state_forcing, scalars, arrays); //Compute the time tendencies for the fluid state in the z-direction - compute_tendencies_z(state_forcing, flux, tend, dt); + compute_tendencies_z(state_forcing, flux, tend, dt, nx, nz, arrays); } ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// //Apply the tendencies to the fluid state + const int i_beg = scalars.i_beg; + const int k_beg = scalars.k_beg; for (int ll = 0; ll < NUM_VARS; ++ll) { for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { @@ -302,7 +347,7 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state const double x = (i_beg + i+0.5)*dx; const double z = (k_beg + k+0.5)*dz; const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); - tend(ID_WMOM, k, i) += wpert*hy_dens_cell[hs+k]; + tend(ID_WMOM, k, i) += wpert * arrays.hy_dens_cell[hs+k]; } state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend(ll, k, i); } @@ -315,7 +360,9 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt) { +void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, + double dt, int nx, int nz, const global_arrays& arrays) +{ double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient hv_coef = -hv_beta * dx / (16*dt); @@ -337,10 +384,10 @@ void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt) } //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) - double r = vals[ID_DENS] + hy_dens_cell[k+hs]; + double r = vals[ID_DENS] + arrays.hy_dens_cell[k+hs]; double u = vals[ID_UMOM] / r; double w = vals[ID_WMOM] / r; - double t = ( vals[ID_RHOT] + hy_dens_theta_cell[k+hs] ) / r; + double t = ( vals[ID_RHOT] + arrays.hy_dens_theta_cell[k+hs] ) 
/ r; double p = C0 * pow(r*t, gamm); //Compute the flux vector @@ -369,7 +416,9 @@ void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, double dt) //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt) { +void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, + double dt, int nx, int nz, const global_arrays& arrays) +{ double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS]; //Compute the hyperviscosity coefficient const double hv_coef = -hv_beta * dz / (16*dt); @@ -391,11 +440,11 @@ void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt) } //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) - double r = vals[ID_DENS] + hy_dens_int[k]; + double r = vals[ID_DENS] + arrays.hy_dens_int[k]; double u = vals[ID_UMOM] / r; double w = vals[ID_WMOM] / r; - double t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r; - double p = C0 * pow(r * t, gamm) - hy_pressure_int[k]; + double t = (vals[ID_RHOT] + arrays.hy_dens_theta_int[k]) / r; + double p = C0 * pow(r * t, gamm) - arrays.hy_pressure_int[k]; //Enforce vertical boundary condition and exact mass conservation if (k == 0 || k == nz) { w = 0; @@ -429,7 +478,12 @@ void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, double dt) //Set this MPI task's halo values in the x-direction. This routine will require MPI -void set_halo_values_x(view_3d state) { +void set_halo_values_x(view_3d state, + const global_scalars& scalars, const global_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + //////////////////////////////////////////////////////////////////////// // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS // (1) give state(1:hs,1:nz,1:NUM_VARS) to my left neighbor @@ -452,13 +506,15 @@ void set_halo_values_x(view_3d state) { //////////////////////////////////////////////////// if (data_spec_int == DATA_SPEC_INJECTION) { - if (myrank == 0) { + if (scalars.myrank == 0) { + const int k_beg = scalars.k_beg; for (int k = 0; k < nz; ++k) { for (int i = 0; i < hs; ++i) { const double z = (k_beg + k+0.5)*dz; if (fabs(z-3*zlen/4) <= zlen/16) { - state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i)+hy_dens_cell[k+hs]) * 50.; - state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i)+hy_dens_cell[k+hs]) * 298. - hy_dens_theta_cell[k+hs]; + state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i) + arrays.hy_dens_cell[k+hs]) * 50.0; + state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i) + arrays.hy_dens_cell[k+hs]) * 298.0 - + arrays.hy_dens_theta_cell[k+hs]; } } } @@ -469,7 +525,11 @@ void set_halo_values_x(view_3d state) { //Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI //decomposition in the vertical direction -void set_halo_values_z(view_3d state) { +void set_halo_values_z(view_3d state, + const global_scalars& scalars, const global_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; ///////////////////////////////////////////////// // TODO: THREAD ME @@ -482,10 +542,10 @@ void set_halo_values_z(view_3d state) { state(ll, nz+hs, i) = 0.; state(ll, nz+hs+1, i) = 0.; } else if (ll == ID_UMOM) { - state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0]; - state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1]; - state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs]; - state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1]; + state(ll, 0, i) = state(ll, hs, i) / arrays.hy_dens_cell[hs] * arrays.hy_dens_cell[0]; + state(ll, 1, i) = state(ll, hs, i) / arrays.hy_dens_cell[hs] * arrays.hy_dens_cell[1]; + state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / arrays.hy_dens_cell[nz+hs-1] * arrays.hy_dens_cell[nz+hs]; + state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / arrays.hy_dens_cell[nz+hs-1] * arrays.hy_dens_cell[nz+hs+1]; } else { state(ll, 0, i) = state(ll, hs, i); state(ll, 1, i) = state(ll, hs, i); @@ -497,7 +557,7 @@ void set_halo_values_z(view_3d state) { } -void init( int *argc , char ***argv ) { +std::tuple init( int *argc , char ***argv ) { (void) MPI_Init(argc,argv); ///////////////////////////////////////////////////////////// @@ -508,17 +568,13 @@ void init( int *argc , char ***argv ) { // (4) COMPUTE HOW MANY X-DIRECTION CELLS MY RANK HAS // (5) FIND MY LEFT AND RIGHT NEIGHBORING RANK IDs ///////////////////////////////////////////////////////////// - nranks = 1; - myrank = 0; - i_beg = 0; - nx = nx_glob; - left_rank = 0; - right_rank = 0; + int i_beg = 0; + int nx = nx_glob; + ////////////////////////////////////////////// // END MPI DUMMY SECTION ////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE @@ -526,23 +582,27 @@ void init( int *argc , char ***argv ) { //////////////////////////////////////////////////////////////////////////////// //Vertical direction isn't MPI-ized, so the rank's local values = the global values - k_beg = 0; - nz = nz_glob; - mainproc = (myrank == 0); + int k_beg = 0; + int nz = nz_glob; + int nranks = 1; + int myrank = 0; + int left_rank = 0; + int right_rank = 0; + bool mainproc = (myrank == 0); //Allocate the model data - state = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); - state_tmp = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); - flux = md::make_unique_mdarray(NUM_VARS, nz+1, nx+1); - tend = md::make_unique_mdarray(NUM_VARS, nz, nx); - hy_dens_cell = std::make_unique(nz+2*hs); - hy_dens_theta_cell = std::make_unique(nz+2*hs); - hy_dens_int = std::make_unique(nz+1); - hy_dens_theta_int = std::make_unique(nz+1); - hy_pressure_int = std::make_unique(nz+1); + auto state = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); + auto state_tmp = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); + auto flux = md::make_unique_mdarray(NUM_VARS, nz+1, nx+1); + auto tend = md::make_unique_mdarray(NUM_VARS, nz, nx); + auto hy_dens_cell = std::make_unique(nz+2*hs); + auto hy_dens_theta_cell = 
std::make_unique(nz+2*hs); + auto hy_dens_int = std::make_unique(nz+1); + auto hy_dens_theta_int = std::make_unique(nz+1); + auto hy_pressure_int = std::make_unique(nz+1); //Define the maximum stable time step based on an assumed maximum wind speed - dt = fmin(dx,dz) / max_speed * cfl; + double dt = fmin(dx,dz) / max_speed * cfl; //Set initial elapsed model time and output_counter to zero etime = 0.0; output_counter = 0.0; @@ -589,13 +649,13 @@ void init( int *argc , char ***argv ) { } //Compute the hydrostatic background state over vertical cell averages for (int k = 0; k < nz+2*hs; ++k) { - hy_dens_cell [k] = 0.; + hy_dens_cell[k] = 0.; hy_dens_theta_cell[k] = 0.; for (int kk = 0; kk < nqpoints; ++kk) { const double z = (k_beg + k-hs+0.5)*dz; //Set the fluid state based on the user's specification auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); - hy_dens_cell [k] = hy_dens_cell [k] + hr * qweights[kk]; + hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk]; hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk]; } } @@ -607,6 +667,56 @@ void init( int *argc , char ***argv ) { hy_dens_theta_int[k] = hr * ht; hy_pressure_int [k] = C0 * pow(hr * ht, gamm); } + + return std::tuple{ +#if defined(__cpp_designated_initializers) + global_scalars{ + .dt = dt, + .nx = nx, + .nz = nz, + .i_beg = i_beg, + .k_beg = k_beg, + .nranks = nranks, + .myrank = myrank, + .left_rank = left_rank, + .right_rank = right_rank + }, + global_arrays{ + .hy_dens_cell = std::move(hy_dens_cell), + .hy_dens_theta_cell = std::move(hy_dens_theta_cell), + .hy_dens_int = std::move(hy_dens_int), + .hy_dens_theta_int = std::move(hy_dens_theta_int), + .hy_pressure_int = std::move(hy_pressure_int), + .state = std::move(state), + .state_tmp = std::move(state_tmp), + .flux = std::move(flux), + .tend = std::move(tend) + } +#else + global_scalars{ + dt, + nx, + nz, + i_beg, + k_beg, + nranks, + myrank, + left_rank, + right_rank + }, + global_arrays{ + std::move(hy_dens_cell), + std::move(hy_dens_theta_cell), + std::move(hy_dens_int), + std::move(hy_dens_theta_int), + std::move(hy_pressure_int), + std::move(state), + std::move(state_tmp), + std::move(flux), + std::move(tend) + } +#endif + }; } @@ -736,7 +846,7 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) //The file I/O uses parallel-netcdf, the only external library required for this mini-app. //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -void output(view_3d state, double etime) { +void output(const global_scalars& scalars, const global_arrays& arrays, double etime) { int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) @@ -745,15 +855,15 @@ void output(view_3d state, double etime) { MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; //Inform the user - if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); } + if (scalars.mainproc()) { fprintf(stderr, "*** OUTPUT ***\n"); } //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta). #if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) - auto dens = md::make_unique_mdarray(nz, nx); - auto uwnd = md::make_unique_mdarray(nz, nx); - auto wwnd = md::make_unique_mdarray(nz, nx); + auto dens = md::make_unique_mdarray(scalars.nz, scalars.nx); + auto uwnd = md::make_unique_mdarray(scalars.nz, scalars.nx); + auto wwnd = md::make_unique_mdarray(scalars.nz, scalars.nx); #endif - auto theta = md::make_unique_mdarray(nz, nx); + auto theta = md::make_unique_mdarray(scalars.nz, scalars.nx); auto etimearr = std::make_unique(1); // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. @@ -799,20 +909,22 @@ void output(view_3d state, double etime) { } //Store perturbed values in the temp arrays for output - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { + for (int k = 0; k < scalars.nz; ++k) { + for (int i = 0; i < scalars.nx; ++i) { #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - dens(k, i) = state(ID_DENS, k+hs, i+hs); - uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ); - wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ); + dens(k, i) = arrays.state(ID_DENS, k+hs, i+hs); + uwnd(k, i) = arrays.state(ID_UMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + arrays.state(ID_DENS, k+hs, i+hs)); + wwnd(k, i) = arrays.state(ID_WMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + arrays.state(ID_DENS, k+hs, i+hs)); #endif - theta(k, i) = ( state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[k+hs] ) / ( hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs) ) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]; + theta(k, i) = (arrays.state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[k+hs]) / + (arrays.hy_dens_cell[k+hs] + arrays.state(ID_DENS, k+hs, i+hs)) - + arrays.hy_dens_theta_cell[k+hs] / arrays.hy_dens_cell[k+hs]; } } //Write the grid data to file with all the processes writing collectively - st3[0] = num_out; st3[1] = k_beg; st3[2] = i_beg; - ct3[0] = 1 ; ct3[1] = nz ; ct3[2] = nx ; + st3[0] = num_out; st3[1] = scalars.k_beg; st3[2] = scalars.i_beg; + ct3[0] = 1 ; ct3[1] = scalars.nz ; ct3[2] = scalars.nx ; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens.get() ) , __LINE__ ); ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd.get() ) , __LINE__ ); @@ -824,7 +936,7 @@ void output(view_3d state, double etime) { //Begin "independent" write mode ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ ); //write elapsed time to file - if (mainproc) { + if (scalars.mainproc()) { st1[0] = num_out; ct1[0] = 1; etimearr[0] = etime; @@ -859,15 +971,18 @@ void finalize() { //Compute reduced quantities for error checking without resorting to the "ncdiff" tool -reduction_result reductions() { +reduction_result reductions(const global_scalars& scalars, const global_arrays& arrays) { reduction_result result{0.0, 0.0}; - for (int k=0; k Date: Tue, 25 Mar 2025 22:12:48 +0200 Subject: [PATCH 59/83] Remove num_out and direction_switch global state --- cpp-mdspan/miniWeather_mdspan.cpp | 48 ++++++++++++++++++------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 3397a02..bd19109 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -133,9 +133,6 @@ struct global_arrays { alloc_3d tend; // Fluid state tendencies. 
Dimensions: (nx,nz,NUM_VARS) }; -int num_out = 0; // The number of outputs performed so far -int direction_switch = 1; - //Declaring the functions defined after "main" std::tuple init(int *argc , char ***argv ); void finalize(); @@ -175,12 +172,14 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); -void output(const global_scalars& scalars, const global_arrays& arrays, double etime); +int output(const global_scalars& scalars, const global_arrays& arrays, + double etime, int num_out); void ncwrap(int ierr, int line); -void perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_scalars& scalars, - const global_arrays& arrays); +int perform_timestep(view_3d state, view_3d state_tmp, + view_3d flux, view_3d tend, + const global_scalars& scalars, + const global_arrays& arrays, + int direction_switch); void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, @@ -206,8 +205,9 @@ reduction_result reductions(const global_scalars& scalars, const global_arrays& // THE MAIN PROGRAM STARTS HERE /////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - auto [scalars, arrays] = init( &argc , &argv ); + int direction_switch = 1; + int num_out = 0; //Initial reductions for mass, kinetic energy, and total energy. // @@ -228,7 +228,7 @@ int main(int argc, char **argv) { NUM_VARS, scalars.nz, scalars.nx); //Output the initial state - output(scalars, arrays, etime); + num_out = output(scalars, arrays, etime, num_out); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP @@ -240,7 +240,8 @@ int main(int argc, char **argv) { scalars.dt = sim_time - etime; } //Perform a single time step - perform_timestep(state_view, state_tmp_view, flux_view, tend_view, scalars, arrays); + direction_switch = perform_timestep(state_view, state_tmp_view, flux_view, tend_view, + scalars, arrays, direction_switch); //Inform the user #if ! 
defined(NO_INFORM) if (scalars.mainproc()) { @@ -253,7 +254,7 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (output_counter >= output_freq) { output_counter = output_counter - output_freq; - output(scalars, arrays, etime); + num_out = output(scalars, arrays, etime, num_out); } } auto t2 = std::chrono::steady_clock::now(); @@ -280,10 +281,13 @@ int main(int argc, char **argv) { // q* = q[n] + dt/3 * rhs(q[n]) // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) -void perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_scalars& scalars, - const global_arrays& arrays) +// +// Return: updated direction_switch +int perform_timestep(view_3d state, view_3d state_tmp, + view_3d flux, view_3d tend, + const global_scalars& scalars, + const global_arrays& arrays, + int direction_switch) { const double dt = scalars.dt; if (direction_switch) { @@ -306,6 +310,8 @@ void perform_timestep(view_3d state, view_3d state_tmp, semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, scalars, arrays); } if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } + + return direction_switch; } @@ -846,7 +852,11 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) //The file I/O uses parallel-netcdf, the only external library required for this mini-app. //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -void output(const global_scalars& scalars, const global_arrays& arrays, double etime) { +// +// Input: number of outputs performed before calling this function. +// Return: number of outputs performed after calling this function. +int output(const global_scalars& scalars, const global_arrays& arrays, + double etime, int num_out) { int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) @@ -948,10 +958,8 @@ void output(const global_scalars& scalars, const global_arrays& arrays, double e //Close the file ncwrap( ncmpi_close(ncid) , __LINE__ ); - //Increment the number of outputs - num_out = num_out + 1; - (void) MPI_Info_free(&mpi_info); + return num_out + 1; } From c83a9c8f37511a957da9bd1717af0b5ff26e28c6 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 25 Mar 2025 22:23:54 +0200 Subject: [PATCH 60/83] Separate const and nonconst arrays --- cpp-mdspan/miniWeather_mdspan.cpp | 115 +++++++++++++++++------------- 1 file changed, 67 insertions(+), 48 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index bd19109..b3a1214 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -118,15 +118,17 @@ struct global_scalars { bool mainproc() const { return myrank == 0; } //Am I the main process (rank == 0)? }; -struct global_arrays { +// Arrays that are allocated in init and never changed after that. +struct global_const_arrays { std::unique_ptr hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) std::unique_ptr hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) std::unique_ptr hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) std::unique_ptr hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). 
Dimensions: (1:nz+1) std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) +}; - // Arrays that are allocated in init and updated throughout the simulation. - +// Arrays that are allocated in init and updated throughout the simulation. +struct global_arrays { alloc_3d state; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) alloc_3d state_tmp; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) alloc_3d flux; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) @@ -134,7 +136,7 @@ struct global_arrays { }; //Declaring the functions defined after "main" -std::tuple init(int *argc , char ***argv ); +std::tuple init(int *argc , char ***argv ); void finalize(); struct test_case { @@ -172,63 +174,66 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); -int output(const global_scalars& scalars, const global_arrays& arrays, - double etime, int num_out); +int output(view_3d state, + const global_scalars& scalars, + const global_const_arrays& arrays, + double etime, int num_out); void ncwrap(int ierr, int line); int perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, const global_scalars& scalars, - const global_arrays& arrays, + const global_const_arrays& arrays, int direction_switch); void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, const global_scalars& scalars, - const global_arrays& arrays); + const global_const_arrays& arrays); void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_arrays& arrays); + double dt, int nx, int nz, const global_const_arrays& arrays); void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_arrays& arrays); + double dt, int nx, int nz, const global_const_arrays& arrays); void set_halo_values_x(view_3d state, - const global_scalars& scalars, const global_arrays& arrays); + const global_scalars& scalars, const global_const_arrays& arrays); void set_halo_values_z(view_3d state, - const global_scalars& scalars, const global_arrays& arrays); + const global_scalars& scalars, const global_const_arrays& arrays); struct reduction_result { double mass; double te; }; -reduction_result reductions(const global_scalars& scalars, const global_arrays& arrays); - +reduction_result reductions(view_3d state, + const global_scalars& scalars, + const global_const_arrays& arrays); /////////////////////////////////////////////////////////////////////////////////////// // THE MAIN PROGRAM STARTS HERE /////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - auto [scalars, arrays] = init( &argc , &argv ); + auto [scalars, const_arrays, arrays] = init( &argc , &argv ); int direction_switch = 1; int num_out = 0; + auto state_view = view_3d(arrays.state.get(), + NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); + auto state_tmp_view = view_3d(arrays.state_tmp.get(), + NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); + auto flux_view = view_3d(arrays.flux.get(), + NUM_VARS, scalars.nz + 1, scalars.nx + 1); + auto tend_view = view_3d(arrays.tend.get(), + NUM_VARS, scalars.nz, scalars.nx); //Initial reductions for mass, kinetic energy, and total energy. 
// // mass0: initial domain total for mass // te0: initial domain total for total energy - auto [mass0, te0] = reductions(scalars, arrays); + auto [mass0, te0] = reductions(state_view, scalars, const_arrays); { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); } - auto state_view = view_3d(arrays.state.get(), - NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); - auto state_tmp_view = view_3d(arrays.state_tmp.get(), - NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); - auto flux_view = view_3d(arrays.flux.get(), - NUM_VARS, scalars.nz + 1, scalars.nx + 1); - auto tend_view = view_3d(arrays.tend.get(), - NUM_VARS, scalars.nz, scalars.nx); //Output the initial state - num_out = output(scalars, arrays, etime, num_out); + num_out = output(state_view, scalars, const_arrays, etime, num_out); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP @@ -241,7 +246,7 @@ int main(int argc, char **argv) { } //Perform a single time step direction_switch = perform_timestep(state_view, state_tmp_view, flux_view, tend_view, - scalars, arrays, direction_switch); + scalars, const_arrays, direction_switch); //Inform the user #if ! defined(NO_INFORM) if (scalars.mainproc()) { @@ -254,7 +259,7 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (output_counter >= output_freq) { output_counter = output_counter - output_freq; - num_out = output(scalars, arrays, etime, num_out); + num_out = output(state_view, scalars, const_arrays, etime, num_out); } } auto t2 = std::chrono::steady_clock::now(); @@ -263,7 +268,7 @@ int main(int argc, char **argv) { } //Final reductions for mass, kinetic energy, and total energy - auto [mass, te] = reductions(scalars, arrays); + auto [mass, te] = reductions(state_view, scalars, const_arrays); if (scalars.mainproc()) { fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); @@ -286,7 +291,7 @@ int main(int argc, char **argv) { int perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, const global_scalars& scalars, - const global_arrays& arrays, + const global_const_arrays& arrays, int direction_switch) { const double dt = scalars.dt; @@ -323,7 +328,7 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, const global_scalars& scalars, - const global_arrays& arrays) + const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -367,7 +372,7 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_arrays& arrays) + double dt, int nx, int nz, const global_const_arrays& arrays) { double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient @@ -423,7 +428,7 @@ void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_arrays& arrays) + double dt, int nx, int nz, const global_const_arrays& arrays) { double stencil[4], 
d3_vals[NUM_VARS], vals[NUM_VARS]; //Compute the hyperviscosity coefficient @@ -485,7 +490,7 @@ void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, //Set this MPI task's halo values in the x-direction. This routine will require MPI void set_halo_values_x(view_3d state, - const global_scalars& scalars, const global_arrays& arrays) + const global_scalars& scalars, const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -532,7 +537,7 @@ void set_halo_values_x(view_3d state, //Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI //decomposition in the vertical direction void set_halo_values_z(view_3d state, - const global_scalars& scalars, const global_arrays& arrays) + const global_scalars& scalars, const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -563,7 +568,7 @@ void set_halo_values_z(view_3d state, } -std::tuple init( int *argc , char ***argv ) { +std::tuple init( int *argc , char ***argv ) { (void) MPI_Init(argc,argv); ///////////////////////////////////////////////////////////// @@ -687,12 +692,14 @@ std::tuple init( int *argc , char ***argv ) { .left_rank = left_rank, .right_rank = right_rank }, - global_arrays{ + global_const_arrays{ .hy_dens_cell = std::move(hy_dens_cell), .hy_dens_theta_cell = std::move(hy_dens_theta_cell), .hy_dens_int = std::move(hy_dens_int), .hy_dens_theta_int = std::move(hy_dens_theta_int), - .hy_pressure_int = std::move(hy_pressure_int), + .hy_pressure_int = std::move(hy_pressure_int) + }, + global_arrays{ .state = std::move(state), .state_tmp = std::move(state_tmp), .flux = std::move(flux), @@ -710,6 +717,13 @@ std::tuple init( int *argc , char ***argv ) { left_rank, right_rank }, + global_const_arrays{ + std::move(hy_dens_cell), + std::move(hy_dens_theta_cell), + std::move(hy_dens_int), + std::move(hy_dens_theta_int), + std::move(hy_pressure_int) + }, global_arrays{ std::move(hy_dens_cell), std::move(hy_dens_theta_cell), @@ -855,7 +869,9 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou // // Input: number of outputs performed before calling this function. // Return: number of outputs performed after calling this function. -int output(const global_scalars& scalars, const global_arrays& arrays, +int output(view_3d state, + const global_scalars& scalars, + const global_const_arrays& arrays, double etime, int num_out) { int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; @@ -922,12 +938,12 @@ int output(const global_scalars& scalars, const global_arrays& arrays, for (int k = 0; k < scalars.nz; ++k) { for (int i = 0; i < scalars.nx; ++i) { #if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) - dens(k, i) = arrays.state(ID_DENS, k+hs, i+hs); - uwnd(k, i) = arrays.state(ID_UMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + arrays.state(ID_DENS, k+hs, i+hs)); - wwnd(k, i) = arrays.state(ID_WMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + arrays.state(ID_DENS, k+hs, i+hs)); + dens(k, i) = state(ID_DENS, k+hs, i+hs); + uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); + wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); #endif - theta(k, i) = (arrays.state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[k+hs]) / - (arrays.hy_dens_cell[k+hs] + arrays.state(ID_DENS, k+hs, i+hs)) - + theta(k, i) = (state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[k+hs]) / + (arrays.hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)) - arrays.hy_dens_theta_cell[k+hs] / arrays.hy_dens_cell[k+hs]; } } @@ -979,7 +995,10 @@ void finalize() { //Compute reduced quantities for error checking without resorting to the "ncdiff" tool -reduction_result reductions(const global_scalars& scalars, const global_arrays& arrays) { +reduction_result reductions(view_3d state, + const global_scalars& scalars, + const global_const_arrays& arrays) +{ reduction_result result{0.0, 0.0}; const int nx = scalars.nx; @@ -987,10 +1006,10 @@ reduction_result reductions(const global_scalars& scalars, const global_arrays& for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { - double r = arrays.state(ID_DENS, k+hs, i+hs) + arrays.hy_dens_cell[hs+k]; // Density - double u = arrays.state(ID_UMOM, k+hs, i+hs) / r; // U-wind - double w = arrays.state(ID_WMOM, k+hs, i+hs) / r; // W-wind - double th = ( arrays.state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[hs+k] ) / r; // Potential Temperature (theta) + double r = state(ID_DENS, k+hs, i+hs) + arrays.hy_dens_cell[hs+k]; // Density + double u = state(ID_UMOM, k+hs, i+hs) / r; // U-wind + double w = state(ID_WMOM, k+hs, i+hs) / r; // W-wind + double th = (state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta) double p = C0 * pow(r * th, gamm); // Pressure double t = th / pow(p0 / p, rd / cp); // Temperature double ke = r*(u*u+w*w); // Kinetic Energy From 5801600cba5a5e4b729df58a0eec787ee013cd23 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 25 Mar 2025 23:52:14 +0200 Subject: [PATCH 61/83] De-globalize etime and output_counter --- cpp-mdspan/miniWeather_mdspan.cpp | 39 +++++++++++++------------------ 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index b3a1214..c710ce4 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -74,16 +74,6 @@ double constexpr dz = zlen / nz_glob; // grid spacing in the x-direct // END USER-CONFIGURABLE PARAMETERS /////////////////////////////////////////////////////////////////////////////////////// -/////////////////////////////////////////////////////////////////////////////////////// -// Variables that are initialized but remain static over the course of the simulation -/////////////////////////////////////////////////////////////////////////////////////// - -/////////////////////////////////////////////////////////////////////////////////////// -// Variables that are dynamics over the course of the simulation -/////////////////////////////////////////////////////////////////////////////////////// -double etime; //Elapsed model time 
-double output_counter; //Helps determine when it's time to do output - namespace md { using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; } // namespace md @@ -102,6 +92,8 @@ using view_3d = md::mdspan, md::layout_right>; struct global_scalars { // Model time step (seconds). The last time step might shorten this. double dt; + double etime = 0.0; //Elapsed model time + double output_counter = 0.0; //Helps determine when it's time to do output // Variables and arrays that are set once in init and remain read-only throughout the simulation. @@ -233,16 +225,16 @@ int main(int argc, char **argv) { } //Output the initial state - num_out = output(state_view, scalars, const_arrays, etime, num_out); + num_out = output(state_view, scalars, const_arrays, scalars.etime, num_out); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP //////////////////////////////////////////////////// auto t1 = std::chrono::steady_clock::now(); - while (etime < sim_time) { + while (scalars.etime < sim_time) { //If the time step leads to exceeding the simulation time, shorten it for the last step - if (etime + scalars.dt > sim_time) { - scalars.dt = sim_time - etime; + if (scalars.etime + scalars.dt > sim_time) { + scalars.dt = sim_time - scalars.etime; } //Perform a single time step direction_switch = perform_timestep(state_view, state_tmp_view, flux_view, tend_view, @@ -250,16 +242,16 @@ int main(int argc, char **argv) { //Inform the user #if ! defined(NO_INFORM) if (scalars.mainproc()) { - fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime, sim_time); + fprintf(stderr, "Elapsed Time: %lf / %lf\n", scalars.etime, sim_time); } #endif //Update the elapsed time and output counter - etime = etime + scalars.dt; - output_counter = output_counter + scalars.dt; + scalars.etime = scalars.etime + scalars.dt; + scalars.output_counter = scalars.output_counter + scalars.dt; //If it's time for output, reset the counter, and do output - if (output_counter >= output_freq) { - output_counter = output_counter - output_freq; - num_out = output(state_view, scalars, const_arrays, etime, num_out); + if (scalars.output_counter >= output_freq) { + scalars.output_counter = scalars.output_counter - output_freq; + num_out = output(state_view, scalars, const_arrays, scalars.etime, num_out); } } auto t2 = std::chrono::steady_clock::now(); @@ -614,9 +606,6 @@ std::tuple init( int *argc , //Define the maximum stable time step based on an assumed maximum wind speed double dt = fmin(dx,dz) / max_speed * cfl; - //Set initial elapsed model time and output_counter to zero - etime = 0.0; - output_counter = 0.0; //If I'm the main process in MPI, display some grid information if (mainproc) { @@ -683,6 +672,8 @@ std::tuple init( int *argc , #if defined(__cpp_designated_initializers) global_scalars{ .dt = dt, + .etime = 0.0, + .output_counter = 0.0, .nx = nx, .nz = nz, .i_beg = i_beg, @@ -708,6 +699,8 @@ std::tuple init( int *argc , #else global_scalars{ dt, + etime, + output_counter, nx, nz, i_beg, From f143b9493a7890c3c98192aab459cfc84899169b Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 00:20:00 +0200 Subject: [PATCH 62/83] Start abstracting global arrays Encapsulate global arrays in a class that offers (mdspan) views. Make functions take mdspan-of-const where possible. 
--- cpp-mdspan/miniWeather_mdspan.cpp | 170 +++++++++++++++++------------- 1 file changed, 98 insertions(+), 72 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index c710ce4..33d1904 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -7,14 +7,13 @@ // ////////////////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include -#include +#include +#include +#include +#include + #include #include "pnetcdf.h" -#include #include "mdspan/mdspan.hpp" #include "unique_mdarray.hpp" @@ -79,6 +78,7 @@ namespace md { } // namespace md using alloc_3d = md::unique_mdarray, md::layout_right>; using view_3d = md::mdspan, md::layout_right>; +using view_3d_const = md::mdspan, md::layout_right>; //Runtime variable arrays // @@ -120,15 +120,46 @@ struct global_const_arrays { }; // Arrays that are allocated in init and updated throughout the simulation. -struct global_arrays { - alloc_3d state; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) - alloc_3d state_tmp; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) - alloc_3d flux; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) - alloc_3d tend; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) +class global_arrays { +public: + global_arrays(alloc_3d state, alloc_3d state_tmp, alloc_3d flux, alloc_3d tend) + : state_(std::move(state)), + state_tmp_(std::move(state_tmp)), + flux_(std::move(flux)), + tend_(std::move(tend)) + {} + + // The view member functions are const, but currently return nonconst views. + // We might consider a different model where users declare access intent + // (read-only, write-only, or read-write) at the point of use. + view_3d state() const { + // The various allocations have dimensions that depend on + // just a few metadata (NUM_VARS, nz, nx, and hs). + // Storing extents for each allocation duplicates storage of these metadata. + // Instead, we might consider flat allocations (e.g., make_unique) + // and constructing layout mappings on the fly in these member functions. + return view_3d{state_}; + } + view_3d state_tmp() const { + return view_3d{state_tmp_}; + } + view_3d flux() const { + return view_3d{flux_}; + } + view_3d tend() const { + return view_3d{tend_}; + } + +private: + alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d state_tmp_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d flux_; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) + alloc_3d tend_; // Fluid state tendencies. 
Dimensions: (nx,nz,NUM_VARS) }; -//Declaring the functions defined after "main" -std::tuple init(int *argc , char ***argv ); +std::tuple +init(int *argc , char ***argv ); + void finalize(); struct test_case { @@ -166,7 +197,7 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); -int output(view_3d state, +int output(view_3d_const state, const global_scalars& scalars, const global_const_arrays& arrays, double etime, int num_out); @@ -176,14 +207,16 @@ int perform_timestep(view_3d state, view_3d state_tmp, const global_scalars& scalars, const global_const_arrays& arrays, int direction_switch); -void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, +void semi_discrete_step(view_3d_const state_init, + view_3d state_forcing, + view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, const global_scalars& scalars, const global_const_arrays& arrays); -void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, +void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, double dt, int nx, int nz, const global_const_arrays& arrays); -void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, +void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, double dt, int nx, int nz, const global_const_arrays& arrays); void set_halo_values_x(view_3d state, const global_scalars& scalars, const global_const_arrays& arrays); @@ -194,7 +227,7 @@ struct reduction_result { double mass; double te; }; -reduction_result reductions(view_3d state, +reduction_result reductions(view_3d_const state, const global_scalars& scalars, const global_const_arrays& arrays); @@ -205,40 +238,34 @@ int main(int argc, char **argv) { auto [scalars, const_arrays, arrays] = init( &argc , &argv ); int direction_switch = 1; int num_out = 0; - auto state_view = view_3d(arrays.state.get(), - NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); - auto state_tmp_view = view_3d(arrays.state_tmp.get(), - NUM_VARS, scalars.nz + 2 * hs, scalars.nx + 2 * hs); - auto flux_view = view_3d(arrays.flux.get(), - NUM_VARS, scalars.nz + 1, scalars.nx + 1); - auto tend_view = view_3d(arrays.tend.get(), - NUM_VARS, scalars.nz, scalars.nx); //Initial reductions for mass, kinetic energy, and total energy. // // mass0: initial domain total for mass // te0: initial domain total for total energy - auto [mass0, te0] = reductions(state_view, scalars, const_arrays); - { + auto [mass0, te0] = reductions(arrays.state(), scalars, const_arrays); +#if ! 
defined(NO_INFORM) + if (scalars.mainproc()) { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); } +#endif //Output the initial state - num_out = output(state_view, scalars, const_arrays, scalars.etime, num_out); + num_out = output(arrays.state(), scalars, const_arrays, scalars.etime, num_out); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP //////////////////////////////////////////////////// - auto t1 = std::chrono::steady_clock::now(); + [[maybe_unused]] auto t1 = std::chrono::steady_clock::now(); while (scalars.etime < sim_time) { //If the time step leads to exceeding the simulation time, shorten it for the last step if (scalars.etime + scalars.dt > sim_time) { scalars.dt = sim_time - scalars.etime; } //Perform a single time step - direction_switch = perform_timestep(state_view, state_tmp_view, flux_view, tend_view, - scalars, const_arrays, direction_switch); + direction_switch = perform_timestep(arrays.state(), arrays.state_tmp(), + arrays.flux(), arrays.tend(), scalars, const_arrays, direction_switch); //Inform the user #if ! defined(NO_INFORM) if (scalars.mainproc()) { @@ -251,21 +278,24 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (scalars.output_counter >= output_freq) { scalars.output_counter = scalars.output_counter - output_freq; - num_out = output(state_view, scalars, const_arrays, scalars.etime, num_out); + num_out = output(arrays.state(), scalars, const_arrays, scalars.etime, num_out); } } - auto t2 = std::chrono::steady_clock::now(); + [[maybe_unused]] auto t2 = std::chrono::steady_clock::now(); +#if ! defined(NO_INFORM) if (scalars.mainproc()) { - std::cerr << "CPU Time: " << std::chrono::duration(t2-t1).count() << " sec\n"; + printf("CPU Time: %e sec\n", std::chrono::duration(t2-t1).count()); } +#endif +#if ! 
defined(NO_INFORM) //Final reductions for mass, kinetic energy, and total energy - auto [mass, te] = reductions(state_view, scalars, const_arrays); - + auto [mass, te] = reductions(arrays.state(), scalars, const_arrays); if (scalars.mainproc()) { fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); fprintf(stderr, "d_te: %le\n" , (te - te0 )/te0 ); } +#endif finalize(); } @@ -316,7 +346,9 @@ int perform_timestep(view_3d state, view_3d state_tmp, //state_out = state_init + dt * rhs(state_forcing) //Meaning the step starts from state_init, computes the rhs using state_forcing, //and stores the result in state_out -void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state_out, +void semi_discrete_step(view_3d_const state_init, + view_3d state_forcing, + view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, const global_scalars& scalars, @@ -363,7 +395,7 @@ void semi_discrete_step(view_3d state_init, view_3d state_forcing, view_3d state //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, +void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, double dt, int nx, int nz, const global_const_arrays& arrays) { double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; @@ -419,7 +451,7 @@ void compute_tendencies_x(view_3d state, view_3d flux, view_3d tend, //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_z(view_3d state, view_3d flux, view_3d tend, +void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, double dt, int nx, int nz, const global_const_arrays& arrays) { double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS]; @@ -560,7 +592,9 @@ void set_halo_values_z(view_3d state, } -std::tuple init( int *argc , char ***argv ) { +std::tuple +init( int *argc , char ***argv ) +{ (void) MPI_Init(argc,argv); ///////////////////////////////////////////////////////////// @@ -593,16 +627,16 @@ std::tuple init( int *argc , int right_rank = 0; bool mainproc = (myrank == 0); - //Allocate the model data - auto state = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); - auto state_tmp = md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs); - auto flux = md::make_unique_mdarray(NUM_VARS, nz+1, nx+1); - auto tend = md::make_unique_mdarray(NUM_VARS, nz, nx); - auto hy_dens_cell = std::make_unique(nz+2*hs); - auto hy_dens_theta_cell = std::make_unique(nz+2*hs); - auto hy_dens_int = std::make_unique(nz+1); - auto hy_dens_theta_int = std::make_unique(nz+1); - auto hy_pressure_int = std::make_unique(nz+1); + global_arrays gl_arrs{ + /* state = */ md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs), + /* state_tmp = */ md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs), + /* flux = */ md::make_unique_mdarray(NUM_VARS, nz+1, nx+1), + /* tend = */ md::make_unique_mdarray(NUM_VARS, nz, nx) + }; + auto state = gl_arrs.state(); + auto state_tmp = gl_arrs.state_tmp(); + auto flux = gl_arrs.flux(); + auto tend = gl_arrs.tend(); //Define the maximum stable time step based on an assumed maximum wind speed double dt = fmin(dx,dz) / max_speed * cfl; @@ -647,6 +681,13 @@ std::tuple 
init( int *argc , } } } + + auto hy_dens_cell = std::make_unique(nz+2*hs); + auto hy_dens_theta_cell = std::make_unique(nz+2*hs); + auto hy_dens_int = std::make_unique(nz+1); + auto hy_dens_theta_int = std::make_unique(nz+1); + auto hy_pressure_int = std::make_unique(nz+1); + //Compute the hydrostatic background state over vertical cell averages for (int k = 0; k < nz+2*hs; ++k) { hy_dens_cell[k] = 0.; @@ -690,17 +731,12 @@ std::tuple init( int *argc , .hy_dens_theta_int = std::move(hy_dens_theta_int), .hy_pressure_int = std::move(hy_pressure_int) }, - global_arrays{ - .state = std::move(state), - .state_tmp = std::move(state_tmp), - .flux = std::move(flux), - .tend = std::move(tend) - } + std::move(gl_arrs) #else global_scalars{ dt, - etime, - output_counter, + /* etime = */ 0.0, + /* output_counter = */ 0.0, nx, nz, i_beg, @@ -717,17 +753,7 @@ std::tuple init( int *argc , std::move(hy_dens_theta_int), std::move(hy_pressure_int) }, - global_arrays{ - std::move(hy_dens_cell), - std::move(hy_dens_theta_cell), - std::move(hy_dens_int), - std::move(hy_dens_theta_int), - std::move(hy_pressure_int), - std::move(state), - std::move(state_tmp), - std::move(flux), - std::move(tend) - } + std::move(gl_arrs) #endif }; } @@ -862,7 +888,7 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou // // Input: number of outputs performed before calling this function. // Return: number of outputs performed after calling this function. -int output(view_3d state, +int output(view_3d_const state, const global_scalars& scalars, const global_const_arrays& arrays, double etime, int num_out) { @@ -988,7 +1014,7 @@ void finalize() { //Compute reduced quantities for error checking without resorting to the "ncdiff" tool -reduction_result reductions(view_3d state, +reduction_result reductions(view_3d_const state, const global_scalars& scalars, const global_const_arrays& arrays) { From a6a30c745db676529f41f2dc8a99e74d503aa16e Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 01:11:18 +0200 Subject: [PATCH 63/83] Encapsulate storage for const arrays --- cpp-mdspan/miniWeather_mdspan.cpp | 207 +++++++++++++++++++----------- 1 file changed, 129 insertions(+), 78 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 33d1904..7816287 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -76,10 +76,17 @@ double constexpr dz = zlen / nz_glob; // grid spacing in the x-direct namespace md { using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; } // namespace md + +// FIXME use dims (make_unique_mdarray constructor +// that takes extents doesn't like that) using alloc_3d = md::unique_mdarray, md::layout_right>; -using view_3d = md::mdspan, md::layout_right>; +using view_3d = md::mdspan, md::layout_right>; using view_3d_const = md::mdspan, md::layout_right>; +using alloc_1d = std::unique_ptr; +using view_1d = md::mdspan, md::layout_right>; +using view_1d_const = md::mdspan, md::layout_right>; + //Runtime variable arrays // // C indexing seems to prefer the extents in reverse order. @@ -111,24 +118,70 @@ struct global_scalars { }; // Arrays that are allocated in init and never changed after that. -struct global_const_arrays { - std::unique_ptr hy_dens_cell; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) - std::unique_ptr hy_dens_theta_cell; //hydrostatic rho*t (vert cell avgs). 
Dimensions: (1-hs:nz+hs) - std::unique_ptr hy_dens_int; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) - std::unique_ptr hy_dens_theta_int; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) - std::unique_ptr hy_pressure_int; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) +class global_const_arrays { +public: + global_const_arrays(int nx, int nz, int hs) : + nx_(nx), + nz_(nz), + hs_(hs), + hy_dens_cell_ (std::make_unique(nz+2*hs)), + hy_dens_theta_cell_(std::make_unique(nz+2*hs)), + hy_dens_int_ (std::make_unique(nz+1)), + hy_dens_theta_int_ (std::make_unique(nz+1)), + hy_pressure_int_ (std::make_unique(nz+1)) + {} + + view_1d_const hy_dens_cell() const { + return view_1d_const{hy_dens_cell_.get(), nz_ + 2 * hs_}; + } + view_1d_const hy_dens_theta_cell() const { + return view_1d_const{hy_dens_theta_cell_.get(), nz_ + 2 * hs_}; + } + view_1d_const hy_dens_int() const { + return view_1d_const{hy_dens_int_.get(), nz_ + 1}; + } + view_1d_const hy_dens_theta_int() const { + return view_1d_const{hy_dens_theta_int_.get(), nz_ + 1}; + } + view_1d_const hy_pressure_int() const { + return view_1d_const{hy_pressure_int_.get(), nz_ + 1}; + } + + view_1d hy_dens_cell() { + return view_1d{hy_dens_cell_.get(), nz_ + 2 * hs_}; + } + view_1d hy_dens_theta_cell() { + return view_1d{hy_dens_theta_cell_.get(), nz_ + 2 * hs_}; + } + view_1d hy_dens_int() { + return view_1d{hy_dens_int_.get(), nz_ + 1}; + } + view_1d hy_dens_theta_int() { + return view_1d{hy_dens_theta_int_.get(), nz_ + 1}; + } + view_1d hy_pressure_int() { + return view_1d{hy_pressure_int_.get(), nz_ + 1}; + } + +private: + int nx_, nz_, hs_; + std::unique_ptr hy_dens_cell_; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + std::unique_ptr hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + std::unique_ptr hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + std::unique_ptr hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + std::unique_ptr hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) }; // Arrays that are allocated in init and updated throughout the simulation. class global_arrays { public: - global_arrays(alloc_3d state, alloc_3d state_tmp, alloc_3d flux, alloc_3d tend) - : state_(std::move(state)), - state_tmp_(std::move(state_tmp)), - flux_(std::move(flux)), - tend_(std::move(tend)) + global_arrays(int nx, int nz) : + state_ (md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs)), + state_tmp_(md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs)), + flux_ (md::make_unique_mdarray(NUM_VARS, nz+1, nx+1)), + tend_ (md::make_unique_mdarray(NUM_VARS, nz, nx)) {} - + // The view member functions are const, but currently return nonconst views. // We might consider a different model where users declare access intent // (read-only, write-only, or read-write) at the point of use. 
@@ -373,6 +426,8 @@ void semi_discrete_step(view_3d_const state_init, // TODO: THREAD ME ///////////////////////////////////////////////// //Apply the tendencies to the fluid state + + auto hy_dens_cell = arrays.hy_dens_cell(); const int i_beg = scalars.i_beg; const int k_beg = scalars.k_beg; for (int ll = 0; ll < NUM_VARS; ++ll) { @@ -382,7 +437,7 @@ void semi_discrete_step(view_3d_const state_init, const double x = (i_beg + i+0.5)*dx; const double z = (k_beg + k+0.5)*dz; const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); - tend(ID_WMOM, k, i) += wpert * arrays.hy_dens_cell[hs+k]; + tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k]; } state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend(ll, k, i); } @@ -405,6 +460,10 @@ void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, // TODO: THREAD ME ///////////////////////////////////////////////// //Compute fluxes in the x-direction for each cell + + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx+1; ++i) { //Use fourth-order interpolation from four cell averages to compute the value at the interface in question @@ -419,10 +478,10 @@ void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, } //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) - double r = vals[ID_DENS] + arrays.hy_dens_cell[k+hs]; + double r = vals[ID_DENS] + hy_dens_cell[k+hs]; double u = vals[ID_UMOM] / r; double w = vals[ID_WMOM] / r; - double t = ( vals[ID_RHOT] + arrays.hy_dens_theta_cell[k+hs] ) / r; + double t = ( vals[ID_RHOT] + hy_dens_theta_cell[k+hs] ) / r; double p = C0 * pow(r*t, gamm); //Compute the flux vector @@ -461,6 +520,11 @@ void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, // TODO: THREAD ME ///////////////////////////////////////////////// //Compute fluxes in the x-direction for each cell + + auto hy_dens_int = arrays.hy_dens_int(); + auto hy_dens_theta_int = arrays.hy_dens_theta_int(); + auto hy_pressure_int = arrays.hy_pressure_int(); + for (int k = 0; k < nz+1; ++k) { for (int i = 0; i < nx; ++i) { //Use fourth-order interpolation from four cell averages to compute the value at the interface in question @@ -475,11 +539,11 @@ void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, } //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) - double r = vals[ID_DENS] + arrays.hy_dens_int[k]; + double r = vals[ID_DENS] + hy_dens_int[k]; double u = vals[ID_UMOM] / r; double w = vals[ID_WMOM] / r; - double t = (vals[ID_RHOT] + arrays.hy_dens_theta_int[k]) / r; - double p = C0 * pow(r * t, gamm) - arrays.hy_pressure_int[k]; + double t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r; + double p = C0 * pow(r * t, gamm) - hy_pressure_int[k]; //Enforce vertical boundary condition and exact mass conservation if (k == 0 || k == nz) { w = 0; @@ -542,14 +606,16 @@ void set_halo_values_x(view_3d state, if (data_spec_int == DATA_SPEC_INJECTION) { if (scalars.myrank == 0) { + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); const int k_beg = scalars.k_beg; for (int k = 0; k < nz; ++k) { for (int i = 0; i < hs; ++i) { const double z = (k_beg + k+0.5)*dz; if (fabs(z-3*zlen/4) <= zlen/16) { - state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i) + arrays.hy_dens_cell[k+hs]) * 50.0; - state(ID_RHOT, k+hs, i) = 
(state(ID_DENS, k+hs, i) + arrays.hy_dens_cell[k+hs]) * 298.0 - - arrays.hy_dens_theta_cell[k+hs]; + state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 50.0; + state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 298.0 - + hy_dens_theta_cell[k+hs]; } } } @@ -565,6 +631,7 @@ void set_halo_values_z(view_3d state, { const int nx = scalars.nx; const int nz = scalars.nz; + auto hy_dens_cell = arrays.hy_dens_cell(); ///////////////////////////////////////////////// // TODO: THREAD ME @@ -577,10 +644,10 @@ void set_halo_values_z(view_3d state, state(ll, nz+hs, i) = 0.; state(ll, nz+hs+1, i) = 0.; } else if (ll == ID_UMOM) { - state(ll, 0, i) = state(ll, hs, i) / arrays.hy_dens_cell[hs] * arrays.hy_dens_cell[0]; - state(ll, 1, i) = state(ll, hs, i) / arrays.hy_dens_cell[hs] * arrays.hy_dens_cell[1]; - state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / arrays.hy_dens_cell[nz+hs-1] * arrays.hy_dens_cell[nz+hs]; - state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / arrays.hy_dens_cell[nz+hs-1] * arrays.hy_dens_cell[nz+hs+1]; + state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0]; + state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1]; + state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs]; + state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1]; } else { state(ll, 0, i) = state(ll, hs, i); state(ll, 1, i) = state(ll, hs, i); @@ -627,12 +694,7 @@ init( int *argc , char ***argv ) int right_rank = 0; bool mainproc = (myrank == 0); - global_arrays gl_arrs{ - /* state = */ md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs), - /* state_tmp = */ md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs), - /* flux = */ md::make_unique_mdarray(NUM_VARS, nz+1, nx+1), - /* tend = */ md::make_unique_mdarray(NUM_VARS, nz, nx) - }; + global_arrays gl_arrs(nx, nz); auto state = gl_arrs.state(); auto state_tmp = gl_arrs.state_tmp(); auto flux = gl_arrs.flux(); @@ -682,11 +744,13 @@ init( int *argc , char ***argv ) } } - auto hy_dens_cell = std::make_unique(nz+2*hs); - auto hy_dens_theta_cell = std::make_unique(nz+2*hs); - auto hy_dens_int = std::make_unique(nz+1); - auto hy_dens_theta_int = std::make_unique(nz+1); - auto hy_pressure_int = std::make_unique(nz+1); + global_const_arrays gl_const_arrs(nx, nz, hs); + // Get nonconst views, so we can fill them in below. 
+ auto hy_dens_cell = gl_const_arrs.hy_dens_cell(); + auto hy_dens_theta_cell = gl_const_arrs.hy_dens_theta_cell(); + auto hy_dens_int = gl_const_arrs.hy_dens_int(); + auto hy_dens_theta_int = gl_const_arrs.hy_dens_theta_int(); + auto hy_pressure_int = gl_const_arrs.hy_pressure_int(); //Compute the hydrostatic background state over vertical cell averages for (int k = 0; k < nz+2*hs; ++k) { @@ -710,8 +774,8 @@ init( int *argc , char ***argv ) } return std::tuple{ -#if defined(__cpp_designated_initializers) global_scalars{ +#if defined(__cpp_designated_initializers) .dt = dt, .etime = 0.0, .output_counter = 0.0, @@ -723,17 +787,7 @@ init( int *argc , char ***argv ) .myrank = myrank, .left_rank = left_rank, .right_rank = right_rank - }, - global_const_arrays{ - .hy_dens_cell = std::move(hy_dens_cell), - .hy_dens_theta_cell = std::move(hy_dens_theta_cell), - .hy_dens_int = std::move(hy_dens_int), - .hy_dens_theta_int = std::move(hy_dens_theta_int), - .hy_pressure_int = std::move(hy_pressure_int) - }, - std::move(gl_arrs) #else - global_scalars{ dt, /* etime = */ 0.0, /* output_counter = */ 0.0, @@ -745,16 +799,10 @@ init( int *argc , char ***argv ) myrank, left_rank, right_rank +#endif }, - global_const_arrays{ - std::move(hy_dens_cell), - std::move(hy_dens_theta_cell), - std::move(hy_dens_int), - std::move(hy_dens_theta_int), - std::move(hy_pressure_int) - }, + std::move(gl_const_arrs), std::move(gl_arrs) -#endif }; } @@ -954,16 +1002,19 @@ int output(view_3d_const state, } //Store perturbed values in the temp arrays for output + + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); for (int k = 0; k < scalars.nz; ++k) { for (int i = 0; i < scalars.nx; ++i) { #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) dens(k, i) = state(ID_DENS, k+hs, i+hs); - uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); - wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / (arrays.hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); + uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); + wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); #endif - theta(k, i) = (state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[k+hs]) / - (arrays.hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)) - - arrays.hy_dens_theta_cell[k+hs] / arrays.hy_dens_cell[k+hs]; + theta(k, i) = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[k+hs]) / + (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)) - + hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]; } } @@ -1019,32 +1070,32 @@ reduction_result reductions(view_3d_const state, const global_const_arrays& arrays) { reduction_result result{0.0, 0.0}; - const int nx = scalars.nx; const int nz = scalars.nz; + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { - double r = state(ID_DENS, k+hs, i+hs) + arrays.hy_dens_cell[hs+k]; // Density - double u = state(ID_UMOM, k+hs, i+hs) / r; // U-wind - double w = state(ID_WMOM, k+hs, i+hs) / r; // W-wind - double th = (state(ID_RHOT, k+hs, i+hs) + arrays.hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta) - double p = C0 * pow(r * th, gamm); // Pressure - double t = th / pow(p0 / p, rd / cp); // Temperature - double ke = r*(u*u+w*w); // Kinetic Energy - double ie = r*cv*t; // Internal Energy + double r = state(ID_DENS, k+hs, i+hs) + hy_dens_cell[hs+k]; // Density + double u = 
state(ID_UMOM, k+hs, i+hs) / r; // U-wind + double w = state(ID_WMOM, k+hs, i+hs) / r; // W-wind + double th = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta) + double p = C0 * pow(r * th, gamm); // Pressure + double t = th / pow(p0 / p, rd / cp); // Temperature + double ke = r*(u*u+w*w); // Kinetic Energy + double ie = r*cv*t; // Internal Energy result.mass += r *dx*dz; // Accumulate domain mass result.te += (ke + ie)*dx*dz; // Accumulate domain total energy } } - double glob[2], loc[2]; - loc[0] = result.mass; - loc[1] = result.te; - int ierr = MPI_Allreduce(loc,glob,2,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD); - result.mass = glob[0]; - result.te = glob[1]; - - return result; + std::array loc{result.mass, result.te}; + std::array glob{0.0, 0.0}; + int ierr = MPI_Allreduce(loc.data(), glob.data(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return reduction_result{ + .mass = glob[0], + .te = glob[1] + }; } From 351e529e126ac4b2c5a546c049fef33c66ca902d Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 01:36:25 +0200 Subject: [PATCH 64/83] Optimize extents and allocation --- cpp-mdspan/miniWeather_mdspan.cpp | 126 ++++++++++++++++-------------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 7816287..2e2fe2f 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -73,28 +73,27 @@ double constexpr dz = zlen / nz_glob; // grid spacing in the x-direct // END USER-CONFIGURABLE PARAMETERS /////////////////////////////////////////////////////////////////////////////////////// -namespace md { - using MDSPAN_IMPL_STANDARD_NAMESPACE :: MDSPAN_IMPL_PROPOSED_NAMESPACE :: dims; -} // namespace md - -// FIXME use dims (make_unique_mdarray constructor -// that takes extents doesn't like that) -using alloc_3d = md::unique_mdarray, md::layout_right>; -using view_3d = md::mdspan, md::layout_right>; -using view_3d_const = md::mdspan, md::layout_right>; +// NUM_VARS is a compile-time constant, so we bake it into the extents type. +using extents_3d = md::extents; +using view_3d = md::mdspan; +using view_3d_const = md::mdspan; +using extents_1d = md::extents; // a.k.a. dims<1, int>; +using view_1d = md::mdspan; +using view_1d_const = md::mdspan; + +// All dynamic array allocation happens here. +// Deallocation other than through `delete [] ptr` would happen +// through a custom Deleter (second template argument of `unique_ptr`). + +using alloc_3d = std::unique_ptr; +alloc_3d make_unique_array_3d(int X, int Y, int Z) { + return std::make_unique(X * Y * Z); +} using alloc_1d = std::unique_ptr; -using view_1d = md::mdspan, md::layout_right>; -using view_1d_const = md::mdspan, md::layout_right>; - -//Runtime variable arrays -// -// C indexing seems to prefer the extents in reverse order. -// Respecting that also avoids divergence from the Python version. -// This means that the mdspan must be layout_right; the intent appears -// to be for C code to use row-major storage, but with Fortran ordering. -// -// state extents: NUM_VARS, (nz+2*hs), (nx+2*hs) +alloc_1d make_unique_array_1d(int X) { + return std::make_unique(X); +} struct global_scalars { // Model time step (seconds). The last time step might shorten this. @@ -117,20 +116,21 @@ struct global_scalars { bool mainproc() const { return myrank == 0; } //Am I the main process (rank == 0)? }; -// Arrays that are allocated in init and never changed after that. 
+// Arrays that are allocated and filled in init and never changed after that. class global_const_arrays { public: global_const_arrays(int nx, int nz, int hs) : nx_(nx), nz_(nz), hs_(hs), - hy_dens_cell_ (std::make_unique(nz+2*hs)), - hy_dens_theta_cell_(std::make_unique(nz+2*hs)), - hy_dens_int_ (std::make_unique(nz+1)), - hy_dens_theta_int_ (std::make_unique(nz+1)), - hy_pressure_int_ (std::make_unique(nz+1)) + hy_dens_cell_ (make_unique_array_1d(nz+2*hs)), + hy_dens_theta_cell_(make_unique_array_1d(nz+2*hs)), + hy_dens_int_ (make_unique_array_1d(nz+1)), + hy_dens_theta_int_ (make_unique_array_1d(nz+1)), + hy_pressure_int_ (make_unique_array_1d(nz+1)) {} + // Const views exist for all use after init. view_1d_const hy_dens_cell() const { return view_1d_const{hy_dens_cell_.get(), nz_ + 2 * hs_}; } @@ -147,6 +147,7 @@ class global_const_arrays { return view_1d_const{hy_pressure_int_.get(), nz_ + 1}; } + // Nonconst views exist for init. view_1d hy_dens_cell() { return view_1d{hy_dens_cell_.get(), nz_ + 2 * hs_}; } @@ -165,53 +166,67 @@ class global_const_arrays { private: int nx_, nz_, hs_; - std::unique_ptr hy_dens_cell_; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) - std::unique_ptr hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) - std::unique_ptr hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) - std::unique_ptr hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) - std::unique_ptr hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_dens_cell_; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + alloc_1d hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + alloc_1d hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) }; // Arrays that are allocated in init and updated throughout the simulation. +// +// C indexing seems to prefer the extents in reverse order. +// Respecting that also avoids divergence from the Python version. +// This means that the mdspan must be layout_right; the intent appears +// to be for C code to use row-major storage, but with Fortran ordering. class global_arrays { public: - global_arrays(int nx, int nz) : - state_ (md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs)), - state_tmp_(md::make_unique_mdarray(NUM_VARS, nz+2*hs, nx+2*hs)), - flux_ (md::make_unique_mdarray(NUM_VARS, nz+1, nx+1)), - tend_ (md::make_unique_mdarray(NUM_VARS, nz, nx)) + global_arrays(int nx, int nz, int hs) : + nx_(nx), + nz_(nz), + hs_(hs), + state_ (make_unique_array_3d(NUM_VARS, nz+2*hs, nx+2*hs)), + state_tmp_(make_unique_array_3d(NUM_VARS, nz+2*hs, nx+2*hs)), + flux_ (make_unique_array_3d(NUM_VARS, nz+1, nx+1)), + tend_ (make_unique_array_3d(NUM_VARS, nz, nx)) {} - // The view member functions are const, but currently return nonconst views. + // The view member functions are nonconst and return mdspan-of-nonconst. // We might consider a different model where users declare access intent // (read-only, write-only, or read-write) at the point of use. - view_3d state() const { - // The various allocations have dimensions that depend on + view_3d state() { + // The various allocations have related dimensions that depend on // just a few metadata (NUM_VARS, nz, nx, and hs). 
- // Storing extents for each allocation duplicates storage of these metadata. - // Instead, we might consider flat allocations (e.g., make_unique) - // and constructing layout mappings on the fly in these member functions. - return view_3d{state_}; + // Storing extents for each allocation would duplicate metadata storage. + // Instead, we use flat allocations and construct layout mappings on the fly + // in the member functions that return (mdspan) views. + return view_3d{state_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; } - view_3d state_tmp() const { - return view_3d{state_tmp_}; + view_3d state_tmp() { + return view_3d{state_tmp_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; } - view_3d flux() const { - return view_3d{flux_}; + view_3d flux() { + return view_3d{flux_.get(), NUM_VARS, nz_ + 1, nx_ + 1}; } - view_3d tend() const { - return view_3d{tend_}; + view_3d tend() { + return view_3d{tend_.get(), NUM_VARS, nz_, nx_}; } private: + int nx_, nz_, hs_; alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) alloc_3d state_tmp_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) alloc_3d flux_; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) alloc_3d tend_; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) }; -std::tuple -init(int *argc , char ***argv ); +struct init_result { + global_scalars scalars; + global_const_arrays const_arrays; + global_arrays arrays; +}; + +init_result init(int *argc , char ***argv); void finalize(); @@ -658,10 +673,7 @@ void set_halo_values_z(view_3d state, } } - -std::tuple -init( int *argc , char ***argv ) -{ +init_result init( int *argc , char ***argv ) { (void) MPI_Init(argc,argv); ///////////////////////////////////////////////////////////// @@ -694,7 +706,7 @@ init( int *argc , char ***argv ) int right_rank = 0; bool mainproc = (myrank == 0); - global_arrays gl_arrs(nx, nz); + global_arrays gl_arrs(nx, nz, hs); auto state = gl_arrs.state(); auto state_tmp = gl_arrs.state_tmp(); auto flux = gl_arrs.flux(); @@ -773,7 +785,7 @@ init( int *argc , char ***argv ) hy_pressure_int [k] = C0 * pow(hr * ht, gamm); } - return std::tuple{ + return init_result{ global_scalars{ #if defined(__cpp_designated_initializers) .dt = dt, From d7883a8c2d3bb48d8ae155300dd3d20368914dae Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 18:21:55 +0200 Subject: [PATCH 65/83] Separate const and nonconst scalars This helps manage mutable state like the output counter and dt. --- cpp-mdspan/miniWeather_mdspan.cpp | 214 +++++++++++++++++------------- 1 file changed, 121 insertions(+), 93 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 2e2fe2f..6792798 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -1,4 +1,3 @@ - ////////////////////////////////////////////////////////////////////////////////////////// // miniWeather // Author: Matt Norman , Oak Ridge National Laboratory @@ -95,14 +94,8 @@ alloc_1d make_unique_array_1d(int X) { return std::make_unique(X); } -struct global_scalars { - // Model time step (seconds). The last time step might shorten this. - double dt; - double etime = 0.0; //Elapsed model time - double output_counter = 0.0; //Helps determine when it's time to do output - - // Variables and arrays that are set once in init and remain read-only throughout the simulation. - +// Variables that are set once in init and remain read-only throughout the simulation. 
+struct global_const_scalars { int nx = nx_glob; int nz = nz_glob; //Number of local grid cells in the x- and z- dimensions for this MPI task int i_beg = 0; @@ -116,6 +109,14 @@ struct global_scalars { bool mainproc() const { return myrank == 0; } //Am I the main process (rank == 0)? }; +struct global_scalars { + // Model time step (seconds). The last time step might shorten this. + double dt; + double etime = 0.0; //Elapsed model time + double output_counter = 0.0; //Helps determine when it's time to do output + int num_out = 0; //Number of outputs performed +}; + // Arrays that are allocated and filled in init and never changed after that. class global_const_arrays { public: @@ -221,6 +222,7 @@ class global_arrays { }; struct init_result { + global_const_scalars const_scalars; global_scalars scalars; global_const_arrays const_arrays; global_arrays arrays; @@ -265,62 +267,66 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); -int output(view_3d_const state, - const global_scalars& scalars, - const global_const_arrays& arrays, - double etime, int num_out); +void output(view_3d_const state, + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays, + global_scalars& scalars); void ncwrap(int ierr, int line); int perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, + const global_const_scalars& c_scalars, + const global_const_arrays& c_arrays, const global_scalars& scalars, - const global_const_arrays& arrays, - int direction_switch); + int direction_switch); // TODO Put in (mutable) scalars void semi_discrete_step(view_3d_const state_init, view_3d state_forcing, view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, - const global_scalars& scalars, + const global_const_scalars& scalars, const global_const_arrays& arrays); -void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_const_arrays& arrays); -void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_const_arrays& arrays); +void compute_tendencies_x(view_3d_const state, + view_3d flux, view_3d tend, double dt, + const global_const_scalars& scalars, + const global_const_arrays& arrays); +void compute_tendencies_z(view_3d_const state, + view_3d flux, view_3d tend, double dt, + const global_const_scalars& scalars, + const global_const_arrays& arrays); void set_halo_values_x(view_3d state, - const global_scalars& scalars, const global_const_arrays& arrays); + const global_const_scalars& scalars, const global_const_arrays& arrays); void set_halo_values_z(view_3d state, - const global_scalars& scalars, const global_const_arrays& arrays); + const global_const_scalars& scalars, const global_const_arrays& arrays); struct reduction_result { double mass; double te; }; reduction_result reductions(view_3d_const state, - const global_scalars& scalars, - const global_const_arrays& arrays); + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays); /////////////////////////////////////////////////////////////////////////////////////// // THE MAIN PROGRAM STARTS HERE /////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - auto [scalars, const_arrays, arrays] = init( &argc , &argv ); + auto [const_scalars, scalars, const_arrays, 
arrays] = init( &argc , &argv ); int direction_switch = 1; - int num_out = 0; //Initial reductions for mass, kinetic energy, and total energy. // // mass0: initial domain total for mass // te0: initial domain total for total energy - auto [mass0, te0] = reductions(arrays.state(), scalars, const_arrays); + auto [mass0, te0] = reductions(arrays.state(), const_scalars, const_arrays); #if ! defined(NO_INFORM) - if (scalars.mainproc()) { + if (const_scalars.mainproc()) { fprintf(stderr, "mass0: %le\n" , mass0); fprintf(stderr, "te0: %le\n" , te0 ); } #endif //Output the initial state - num_out = output(arrays.state(), scalars, const_arrays, scalars.etime, num_out); + output(arrays.state(), const_scalars, const_arrays, scalars); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP @@ -333,10 +339,11 @@ int main(int argc, char **argv) { } //Perform a single time step direction_switch = perform_timestep(arrays.state(), arrays.state_tmp(), - arrays.flux(), arrays.tend(), scalars, const_arrays, direction_switch); + arrays.flux(), arrays.tend(), const_scalars, const_arrays, scalars, + direction_switch); //Inform the user #if ! defined(NO_INFORM) - if (scalars.mainproc()) { + if (const_scalars.mainproc()) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", scalars.etime, sim_time); } #endif @@ -346,20 +353,20 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (scalars.output_counter >= output_freq) { scalars.output_counter = scalars.output_counter - output_freq; - num_out = output(arrays.state(), scalars, const_arrays, scalars.etime, num_out); + output(arrays.state(), const_scalars, const_arrays, scalars); } } [[maybe_unused]] auto t2 = std::chrono::steady_clock::now(); #if ! defined(NO_INFORM) - if (scalars.mainproc()) { + if (const_scalars.mainproc()) { printf("CPU Time: %e sec\n", std::chrono::duration(t2-t1).count()); } #endif #if ! 
defined(NO_INFORM) //Final reductions for mass, kinetic energy, and total energy - auto [mass, te] = reductions(arrays.state(), scalars, const_arrays); - if (scalars.mainproc()) { + auto [mass, te] = reductions(arrays.state(), const_scalars, const_arrays); + if (const_scalars.mainproc()) { fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); fprintf(stderr, "d_te: %le\n" , (te - te0 )/te0 ); } @@ -380,29 +387,30 @@ int main(int argc, char **argv) { // Return: updated direction_switch int perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, + const global_const_scalars& c_scalars, + const global_const_arrays& c_arrays, const global_scalars& scalars, - const global_const_arrays& arrays, int direction_switch) { const double dt = scalars.dt; if (direction_switch) { //x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, scalars, arrays); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); //z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, scalars, arrays); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays); } else { //z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, scalars, arrays); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays); //x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, scalars, arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, scalars, arrays); + semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); + semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); } if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } @@ -419,7 +427,7 @@ void semi_discrete_step(view_3d_const state_init, view_3d state_out, double dt /* not scalars.dt */, direction 
dir, view_3d flux, view_3d tend, - const global_scalars& scalars, + const global_const_scalars& scalars, const global_const_arrays& arrays) { const int nx = scalars.nx; @@ -429,12 +437,12 @@ void semi_discrete_step(view_3d_const state_init, //Set the halo values for this MPI task's fluid state in the x-direction set_halo_values_x(state_forcing, scalars, arrays); //Compute the time tendencies for the fluid state in the x-direction - compute_tendencies_x(state_forcing, flux, tend, dt, nx, nz, arrays); + compute_tendencies_x(state_forcing, flux, tend, dt, scalars, arrays); } else if (dir == direction::Z) { //Set the halo values for this MPI task's fluid state in the z-direction set_halo_values_z(state_forcing, scalars, arrays); //Compute the time tendencies for the fluid state in the z-direction - compute_tendencies_z(state_forcing, flux, tend, dt, nx, nz, arrays); + compute_tendencies_z(state_forcing, flux, tend, dt, scalars, arrays); } ///////////////////////////////////////////////// @@ -465,9 +473,14 @@ void semi_discrete_step(view_3d_const state_init, //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_const_arrays& arrays) +void compute_tendencies_x(view_3d_const state, + view_3d flux, view_3d tend, double dt, + const global_const_scalars& scalars, + const global_const_arrays& arrays) { + const int nx = scalars.nx; + const int nz = scalars.nz; + double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; //Compute the hyperviscosity coefficient hv_coef = -hv_beta * dx / (16*dt); @@ -525,9 +538,14 @@ void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes -void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, - double dt, int nx, int nz, const global_const_arrays& arrays) +void compute_tendencies_z(view_3d_const state, + view_3d flux, view_3d tend, double dt, + const global_const_scalars& scalars, + const global_const_arrays& arrays) { + const int nx = scalars.nx; + const int nz = scalars.nz; + double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS]; //Compute the hyperviscosity coefficient const double hv_coef = -hv_beta * dz / (16*dt); @@ -593,7 +611,7 @@ void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, //Set this MPI task's halo values in the x-direction. This routine will require MPI void set_halo_values_x(view_3d state, - const global_scalars& scalars, const global_const_arrays& arrays) + const global_const_scalars& scalars, const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -642,7 +660,7 @@ void set_halo_values_x(view_3d state, //Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI //decomposition in the vertical direction void set_halo_values_z(view_3d state, - const global_scalars& scalars, const global_const_arrays& arrays) + const global_const_scalars& scalars, const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -786,11 +804,8 @@ init_result init( int *argc , char ***argv ) { } return init_result{ - global_scalars{ + global_const_scalars{ #if defined(__cpp_designated_initializers) - .dt = dt, - .etime = 0.0, - .output_counter = 0.0, .nx = nx, .nz = nz, .i_beg = i_beg, @@ -800,9 +815,6 @@ init_result init( int *argc , char ***argv ) { .left_rank = left_rank, .right_rank = right_rank #else - dt, - /* etime = */ 0.0, - /* output_counter = */ 0.0, nx, nz, i_beg, @@ -811,6 +823,19 @@ init_result init( int *argc , char ***argv ) { myrank, left_rank, right_rank +#endif + }, + global_scalars{ +#if defined(__cpp_designated_initializers) + .dt = dt, + .etime = 0.0, + .output_counter = 0.0, + .num_out = 0 +#else + dt, + /* etime = */ 0.0, + /* output_counter = */ 0.0, + /* num_out = */ 0 #endif }, std::move(gl_const_arrs), @@ -945,13 +970,13 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) //The file I/O uses parallel-netcdf, the only external library required for this mini-app. //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -// -// Input: number of outputs performed before calling this function. -// Return: number of outputs performed after calling this function. -int output(view_3d_const state, - const global_scalars& scalars, - const global_const_arrays& arrays, - double etime, int num_out) { +void output(view_3d_const state, + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays, + global_scalars& scalars) +{ + const int nx = const_scalars.nx; + const int nz = const_scalars.nz; int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) @@ -960,15 +985,15 @@ int output(view_3d_const state, MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; //Inform the user - if (scalars.mainproc()) { fprintf(stderr, "*** OUTPUT ***\n"); } + if (const_scalars.mainproc()) { fprintf(stderr, "*** OUTPUT ***\n"); } //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta). #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - auto dens = md::make_unique_mdarray(scalars.nz, scalars.nx); - auto uwnd = md::make_unique_mdarray(scalars.nz, scalars.nx); - auto wwnd = md::make_unique_mdarray(scalars.nz, scalars.nx); + auto dens = md::make_unique_mdarray(nz, nx); + auto uwnd = md::make_unique_mdarray(nz, nx); + auto wwnd = md::make_unique_mdarray(nz, nx); #endif - auto theta = md::make_unique_mdarray(scalars.nz, scalars.nx); + auto theta = md::make_unique_mdarray(nz, nx); auto etimearr = std::make_unique(1); // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. @@ -981,7 +1006,7 @@ int output(view_3d_const state, } //If the elapsed time is zero, create the file. 
Otherwise, open the file - if (etime == 0) { + if (scalars.etime == 0) { //Create the file ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , mpi_info , &ncid ) , __LINE__ ); //Create the dimensions @@ -1015,10 +1040,10 @@ int output(view_3d_const state, //Store perturbed values in the temp arrays for output - auto hy_dens_cell = arrays.hy_dens_cell(); - auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); - for (int k = 0; k < scalars.nz; ++k) { - for (int i = 0; i < scalars.nx; ++i) { + auto hy_dens_cell = const_arrays.hy_dens_cell(); + auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) dens(k, i) = state(ID_DENS, k+hs, i+hs); uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); @@ -1031,8 +1056,11 @@ int output(view_3d_const state, } //Write the grid data to file with all the processes writing collectively - st3[0] = num_out; st3[1] = scalars.k_beg; st3[2] = scalars.i_beg; - ct3[0] = 1 ; ct3[1] = scalars.nz ; ct3[2] = scalars.nx ; + const int k_beg = const_scalars.k_beg; + const int i_beg = const_scalars.i_beg; + + st3[0] = scalars.num_out; st3[1] = k_beg; st3[2] = i_beg; + ct3[0] = 1; ct3[1] = nz; ct3[2] = nx; #if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens.get() ) , __LINE__ ); ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd.get() ) , __LINE__ ); @@ -1044,10 +1072,10 @@ int output(view_3d_const state, //Begin "independent" write mode ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ ); //write elapsed time to file - if (scalars.mainproc()) { - st1[0] = num_out; + if (const_scalars.mainproc()) { + st1[0] = scalars.num_out; ct1[0] = 1; - etimearr[0] = etime; + etimearr[0] = scalars.etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr.get() ) , __LINE__ ); } //End "independent" write mode @@ -1057,7 +1085,7 @@ int output(view_3d_const state, ncwrap( ncmpi_close(ncid) , __LINE__ ); (void) MPI_Info_free(&mpi_info); - return num_out + 1; + scalars.num_out++; } @@ -1078,14 +1106,14 @@ void finalize() { //Compute reduced quantities for error checking without resorting to the "ncdiff" tool reduction_result reductions(view_3d_const state, - const global_scalars& scalars, - const global_const_arrays& arrays) + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays) { reduction_result result{0.0, 0.0}; - const int nx = scalars.nx; - const int nz = scalars.nz; - auto hy_dens_cell = arrays.hy_dens_cell(); - auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + const int nx = const_scalars.nx; + const int nz = const_scalars.nz; + auto hy_dens_cell = const_arrays.hy_dens_cell(); + auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx; ++i) { From 204e8d17138d83f7aa9db9298553f9a15960e608 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 19:00:26 +0200 Subject: [PATCH 66/83] More mutable state encapsulation --- cpp-mdspan/miniWeather_mdspan.cpp | 210 +++++++++++++++++------------- 1 file changed, 116 insertions(+), 94 deletions(-) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 6792798..a6f9aed 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -115,6 +115,7 @@ struct global_scalars { double etime = 0.0; //Elapsed model time 
double output_counter = 0.0; //Helps determine when it's time to do output int num_out = 0; //Number of outputs performed + int direction_switch = 1; //Switch to alternate the order of directions }; // Arrays that are allocated and filled in init and never changed after that. @@ -192,9 +193,11 @@ class global_arrays { tend_ (make_unique_array_3d(NUM_VARS, nz, nx)) {} - // The view member functions are nonconst and return mdspan-of-nonconst. - // We might consider a different model where users declare access intent - // (read-only, write-only, or read-write) at the point of use. + // The current model for member functions that get a view of an array + // is for the const-ness of the global_arrays object to determine + // whether the view is a view-of-const or view-of-nonconst. + // We might consider a different model where users explicitly declare + // access intent (read-only, write-only, or read-write) at the point of use. view_3d state() { // The various allocations have related dimensions that depend on // just a few metadata (NUM_VARS, nz, nx, and hs). @@ -213,6 +216,15 @@ class global_arrays { return view_3d{tend_.get(), NUM_VARS, nz_, nx_}; } + view_3d_const state() const { + // The various allocations have related dimensions that depend on + // just a few metadata (NUM_VARS, nz, nx, and hs). + // Storing extents for each allocation would duplicate metadata storage. + // Instead, we use flat allocations and construct layout mappings on the fly + // in the member functions that return (mdspan) views. + return view_3d_const{state_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; + } + private: int nx_, nz_, hs_; alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) @@ -272,12 +284,11 @@ void output(view_3d_const state, const global_const_arrays& const_arrays, global_scalars& scalars); void ncwrap(int ierr, int line); -int perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, - const global_scalars& scalars, - int direction_switch); // TODO Put in (mutable) scalars +void perform_timestep(view_3d state, view_3d state_tmp, + view_3d flux, view_3d tend, + const global_const_scalars& c_scalars, + const global_const_arrays& c_arrays, + global_scalars& scalars); void semi_discrete_step(view_3d_const state_init, view_3d state_forcing, view_3d state_out, @@ -311,13 +322,12 @@ reduction_result reductions(view_3d_const state, /////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { auto [const_scalars, scalars, const_arrays, arrays] = init( &argc , &argv ); - int direction_switch = 1; //Initial reductions for mass, kinetic energy, and total energy. // // mass0: initial domain total for mass // te0: initial domain total for total energy - auto [mass0, te0] = reductions(arrays.state(), const_scalars, const_arrays); + auto [mass0, te0] = reductions(std::as_const(arrays).state(), const_scalars, const_arrays); #if ! 
defined(NO_INFORM) if (const_scalars.mainproc()) { fprintf(stderr, "mass0: %le\n" , mass0); @@ -326,22 +336,20 @@ int main(int argc, char **argv) { #endif //Output the initial state - output(arrays.state(), const_scalars, const_arrays, scalars); + output(std::as_const(arrays).state(), const_scalars, const_arrays, scalars); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP //////////////////////////////////////////////////// [[maybe_unused]] auto t1 = std::chrono::steady_clock::now(); while (scalars.etime < sim_time) { - //If the time step leads to exceeding the simulation time, shorten it for the last step + // If the time step leads to exceeding the simulation time, + // shorten it for the last step if (scalars.etime + scalars.dt > sim_time) { scalars.dt = sim_time - scalars.etime; } - //Perform a single time step - direction_switch = perform_timestep(arrays.state(), arrays.state_tmp(), - arrays.flux(), arrays.tend(), const_scalars, const_arrays, scalars, - direction_switch); - //Inform the user + perform_timestep(arrays.state(), arrays.state_tmp(), arrays.flux(), + arrays.tend(), const_scalars, const_arrays, scalars); #if ! defined(NO_INFORM) if (const_scalars.mainproc()) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", scalars.etime, sim_time); @@ -359,41 +367,40 @@ int main(int argc, char **argv) { [[maybe_unused]] auto t2 = std::chrono::steady_clock::now(); #if ! defined(NO_INFORM) if (const_scalars.mainproc()) { - printf("CPU Time: %e sec\n", std::chrono::duration(t2-t1).count()); + printf("CPU Time: %e s\n", std::chrono::duration(t2-t1).count()); } #endif -#if ! defined(NO_INFORM) //Final reductions for mass, kinetic energy, and total energy auto [mass, te] = reductions(arrays.state(), const_scalars, const_arrays); if (const_scalars.mainproc()) { - fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 ); - fprintf(stderr, "d_te: %le\n" , (te - te0 )/te0 ); + printf("d_mass: %le\n" , (mass - mass0)/mass0); + printf("d_te: %le\n" , (te - te0 )/te0 ); } -#endif finalize(); } - -//Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator -//The dimensional splitting is a second-order-accurate alternating Strang splitting in which the -//order of directions is alternated each time step. -//The Runge-Kutta method used here is defined as follows: +// Perform a single time step. +// Time steps are dimensionally split and +// use a simple low-storage three-stage Runge-Kutta time integrator. +// The dimensional splitting is a second-order-accurate alternating Strang splitting +// that alternates the order of directions each time step. 
+// +// The Runge-Kutta method used here is defined as follows: +// // q* = q[n] + dt/3 * rhs(q[n]) // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) // -// Return: updated direction_switch -int perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, - const global_scalars& scalars, - int direction_switch) +void perform_timestep(view_3d state, view_3d state_tmp, + view_3d flux, view_3d tend, + const global_const_scalars& c_scalars, + const global_const_arrays& c_arrays, + global_scalars& scalars) { const double dt = scalars.dt; - if (direction_switch) { + if (scalars.direction_switch) { //x-direction first semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); @@ -412,9 +419,11 @@ int perform_timestep(view_3d state, view_3d state_tmp, semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); } - if (direction_switch) { direction_switch = 0; } else { direction_switch = 1; } - - return direction_switch; + if (scalars.direction_switch) { + scalars.direction_switch = 0; + } else { + scalars.direction_switch = 1; + } } @@ -450,19 +459,22 @@ void semi_discrete_step(view_3d_const state_init, ///////////////////////////////////////////////// //Apply the tendencies to the fluid state - auto hy_dens_cell = arrays.hy_dens_cell(); - const int i_beg = scalars.i_beg; - const int k_beg = scalars.k_beg; - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { - const double x = (i_beg + i+0.5)*dx; - const double z = (k_beg + k+0.5)*dz; - const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); - tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k]; + { + view_3d_const tend_c = tend; + auto hy_dens_cell = arrays.hy_dens_cell(); + const int i_beg = scalars.i_beg; + const int k_beg = scalars.k_beg; + for (int ll = 0; ll < NUM_VARS; ++ll) { + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + const double x = (i_beg + i+0.5)*dx; + const double z = (k_beg + k+0.5)*dz; + const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); + tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k]; + } + state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend_c(ll, k, i); } - state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend(ll, k, i); } } } @@ -480,32 +492,36 @@ void compute_tendencies_x(view_3d_const state, { const int nx = scalars.nx; const int nz = scalars.nz; + // Hyperviscosity coefficient + const double hv_coef = -hv_beta * dx / (16*dt); + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + + std::array stencil; + std::array d3_vals; + std::array vals; - double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS], hv_coef; - //Compute the hyperviscosity coefficient - hv_coef = -hv_beta * dx / (16*dt); ///////////////////////////////////////////////// // TODO: THREAD ME ///////////////////////////////////////////////// //Compute fluxes in the x-direction for each cell - - auto hy_dens_cell = arrays.hy_dens_cell(); - auto 
hy_dens_theta_cell = arrays.hy_dens_theta_cell(); - for (int k = 0; k < nz; ++k) { for (int i = 0; i < nx+1; ++i) { - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + //Use fourth-order interpolation from four cell averages + //to compute the value at the interface in question for (int ll = 0; ll < NUM_VARS; ++ll) { for (int s = 0; s < sten_size; ++s) { stencil[s] = state(ll, k+hs, i+s); } //Fourth-order-accurate interpolation of the state vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12; - //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity) + //First-order-accurate interpolation of the third spatial derivative + //of the state (for artificial viscosity) d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]; } - //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) + //Compute density, u-wind, w-wind, potential temperature, + //and pressure (r,u,w,t,p respectively) double r = vals[ID_DENS] + hy_dens_cell[k+hs]; double u = vals[ID_UMOM] / r; double w = vals[ID_WMOM] / r; @@ -524,10 +540,13 @@ void compute_tendencies_x(view_3d_const state, // TODO: THREAD ME ///////////////////////////////////////////////// //Use the fluxes to compute tendencies for each cell - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - tend(ll, k, i) = -( flux(ll, k, i+1) - flux(ll, k, i) ) / dx; + { + view_3d_const flux_c = flux; + for (int ll = 0; ll < NUM_VARS; ++ll) { + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { + tend(ll, k, i) = -( flux_c(ll, k, i+1) - flux_c(ll, k, i) ) / dx; + } } } } @@ -545,33 +564,37 @@ void compute_tendencies_z(view_3d_const state, { const int nx = scalars.nx; const int nz = scalars.nz; - - double stencil[4], d3_vals[NUM_VARS], vals[NUM_VARS]; - //Compute the hyperviscosity coefficient + // Hyperviscosity coefficient const double hv_coef = -hv_beta * dz / (16*dt); - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - //Compute fluxes in the x-direction for each cell - auto hy_dens_int = arrays.hy_dens_int(); auto hy_dens_theta_int = arrays.hy_dens_theta_int(); auto hy_pressure_int = arrays.hy_pressure_int(); + std::array stencil; + std::array d3_vals; + std::array vals; + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell for (int k = 0; k < nz+1; ++k) { for (int i = 0; i < nx; ++i) { - //Use fourth-order interpolation from four cell averages to compute the value at the interface in question + //Use fourth-order interpolation from four cell averages + //to compute the value at the interface in question for (int ll = 0; ll < NUM_VARS; ++ll) { for (int s = 0; s < sten_size; ++s) { stencil[s] = state(ll, k+s, i+hs); } //Fourth-order-accurate interpolation of the state vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12; - //First-order-accurate interpolation of the third spatial derivative of the state + //First-order-accurate interpolation of the third spatial derivative + //of the state d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]; } - //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively) + //Compute density, u-wind, w-wind, 
potential temperature, + //and pressure (r,u,w,t,p respectively) double r = vals[ID_DENS] + hy_dens_int[k]; double u = vals[ID_UMOM] / r; double w = vals[ID_WMOM] / r; @@ -595,12 +618,15 @@ void compute_tendencies_z(view_3d_const state, // TODO: THREAD ME ///////////////////////////////////////////////// //Use the fluxes to compute tendencies for each cell - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - tend(ll, k, i) = -( flux(ll, k+1, i) - flux(ll, k, i) ) / dz; - if (ll == ID_WMOM) { - tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav; + { + view_3d_const flux_c = flux; + for (int ll = 0; ll < NUM_VARS; ++ll) { + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { + tend(ll, k, i) = -( flux_c(ll, k+1, i) - flux_c(ll, k, i) ) / dz; + if (ll == ID_WMOM) { + tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav; + } } } } @@ -672,10 +698,10 @@ void set_halo_values_z(view_3d state, for (int ll = 0; ll < NUM_VARS; ++ll) { for (int i = 0; i < nx+2*hs; ++i) { if (ll == ID_WMOM) { - state(ll, 0, i) = 0.; - state(ll, 1, i) = 0.; - state(ll, nz+hs, i) = 0.; - state(ll, nz+hs+1, i) = 0.; + state(ll, 0, i) = 0.0; + state(ll, 1, i) = 0.0; + state(ll, nz+hs, i) = 0.0; + state(ll, nz+hs+1, i) = 0.0; } else if (ll == ID_UMOM) { state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0]; state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1]; @@ -709,12 +735,6 @@ init_result init( int *argc , char ***argv ) { // END MPI DUMMY SECTION ////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - // YOU DON'T NEED TO ALTER ANYTHING BELOW THIS POINT IN THE CODE - //////////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////////// - //Vertical direction isn't MPI-ized, so the rank's local values = the global values int k_beg = 0; int nz = nz_glob; @@ -830,12 +850,14 @@ init_result init( int *argc , char ***argv ) { .dt = dt, .etime = 0.0, .output_counter = 0.0, - .num_out = 0 + .num_out = 0, + .direction_switch = 1 #else dt, /* etime = */ 0.0, /* output_counter = */ 0.0, - /* num_out = */ 0 + /* num_out = */ 0, + /* direction_switch = */ 1 #endif }, std::move(gl_const_arrs), From f526c708522f96f42fe312e7de1e42289285009a Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 19:46:43 +0200 Subject: [PATCH 67/83] Push temp arrays inside loops This will make parallelization easier. 
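
As a sketch of the point of this patch (illustrative only: the constants and
the placeholder load are stand-ins, and the threading pragma mentioned in the
comment is not added by this patch), declaring the scratch arrays inside the
loop body makes each (k, i) iteration own its temporaries, so a later
threading directive needs no privatization clauses for them:

    #include <array>

    // Hypothetical stand-ins for the real compile-time constants.
    constexpr int NUM_VARS  = 4;
    constexpr int sten_size = 4;

    void sweep(int nz, int nx) {
      // A later patch could thread this loop nest, e.g. with
      // "#pragma omp parallel for collapse(2)", without further changes.
      for (int k = 0; k < nz; ++k) {
        for (int i = 0; i < nx + 1; ++i) {
          std::array<double, NUM_VARS> vals{};     // per-iteration scratch
          std::array<double, NUM_VARS> d3_vals{};  // per-iteration scratch
          for (int ll = 0; ll < NUM_VARS; ++ll) {
            std::array<double, sten_size> stencil{};  // private by construction
            for (int s = 0; s < sten_size; ++s) {
              stencil[s] = static_cast<double>(ll + s);  // placeholder load
            }
            // Same interpolation formulas as in compute_tendencies_x/z.
            vals[ll]    = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12;
            d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3];
          }
          (void) vals; (void) d3_vals;  // the real code computes fluxes here
        }
      }
    }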
---
 cpp-mdspan/miniWeather_mdspan.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp
index a6f9aed..84a556b 100644
--- a/cpp-mdspan/miniWeather_mdspan.cpp
+++ b/cpp-mdspan/miniWeather_mdspan.cpp
@@ -497,10 +497,6 @@ void compute_tendencies_x(view_3d_const state,
   auto hy_dens_cell       = arrays.hy_dens_cell();
   auto hy_dens_theta_cell = arrays.hy_dens_theta_cell();
 
-  std::array<double, sten_size> stencil;
-  std::array<double, NUM_VARS> d3_vals;
-  std::array<double, NUM_VARS> vals;
-
   /////////////////////////////////////////////////
   // TODO: THREAD ME
   /////////////////////////////////////////////////
@@ -509,7 +505,10 @@
     for (int i = 0; i < nx+1; ++i) {
       //Use fourth-order interpolation from four cell averages
       //to compute the value at the interface in question
+      std::array<double, NUM_VARS> d3_vals;
+      std::array<double, NUM_VARS> vals;
       for (int ll = 0; ll < NUM_VARS; ++ll) {
+        std::array<double, sten_size> stencil;
         for (int s = 0; s < sten_size; ++s) {
           stencil[s] = state(ll, k+hs, i+s);
         }
@@ -570,9 +569,6 @@ void compute_tendencies_z(view_3d_const state,
   auto hy_dens_theta_int = arrays.hy_dens_theta_int();
   auto hy_pressure_int   = arrays.hy_pressure_int();
 
-  std::array<double, sten_size> stencil;
-  std::array<double, NUM_VARS> d3_vals;
-  std::array<double, NUM_VARS> vals;
 
   /////////////////////////////////////////////////
   // TODO: THREAD ME
@@ -582,7 +578,10 @@
     for (int i = 0; i < nx; ++i) {
       //Use fourth-order interpolation from four cell averages
       //to compute the value at the interface in question
+      std::array<double, NUM_VARS> d3_vals;
+      std::array<double, NUM_VARS> vals;
      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        std::array<double, sten_size> stencil;
         for (int s = 0; s < sten_size; ++s) {
           stencil[s] = state(ll, k+s, i+hs);
         }

From f41e369bac8bd78d594a05f7aa5771336b88447b Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Wed, 26 Mar 2025 22:14:28 +0200
Subject: [PATCH 68/83] Add CUB_INCLUDE_DIR autodetection

---
 cpp-mdspan/CMakeLists.txt | 29 ++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt
index b4698cd..c95769c 100644
--- a/cpp-mdspan/CMakeLists.txt
+++ b/cpp-mdspan/CMakeLists.txt
@@ -1,9 +1,36 @@
-cmake_minimum_required(VERSION 3.12)
+# CMake 3.17 adds FindCUDAToolkit.
+cmake_minimum_required(VERSION 3.17)
 project(miniWeather-mdspan
   VERSION 0.0.1
   LANGUAGES CXX
 )
 
+# Find CUDA Toolkit; it's not required,
+# but other things (like CUB) depend on it.
+find_package(CUDAToolkit)
+
+if (DEFINED CUB_INCLUDE_DIR)
+  if (EXISTS "${CUB_INCLUDE_DIR}/cub/cub.cuh")
+    message(STATUS "User-defined CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}")
+  else()
+    message(FATAL_ERROR "CUB_INCLUDE_DIR=\"${CUB_INCLUDE_DIR}\" is \
+defined but \"${CUB_INCLUDE_DIR}/cub/cub.cuh\" does not exist.")
+  endif()
+else()
+  if (CUDAToolkit_FOUND)
+    # Check whether CUDA Toolkit's include directory has CUB in it.
+    find_path(CUB_INCLUDE_DIR
+      NAMES cub/cub.cuh
+      PATHS
+        ${CUDAToolkit_INCLUDE_DIRS}
+        /usr/include
+        /usr/local/include
+      DOC "Path to CUB include directory."
+ ) + message(STATUS "Discovered CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") + endif() +endif() + # Option to override which version of the C++ Standard to use set(MINIWEATHER_CXX_STANDARD DETECT CACHE STRING "Override the default CXX_STANDARD") From 52352346c25440fa86327ec1b45641a65f40ad57 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 26 Mar 2025 22:39:18 +0200 Subject: [PATCH 69/83] Improve CUB include path autodetection --- cpp-mdspan/CMakeLists.txt | 6 +++++- cpp-mdspan/miniWeather_mdspan.cpp | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index c95769c..fb89098 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -28,6 +28,10 @@ else() DOC "Path to CUB include directory." ) message(STATUS "Discovered CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") + if (CUDAToolkit_VERSION VERSION_LESS "12.8.0") + message(STATUS "CUDAToolkit_VERSION 12.6.85 is known to lack ForEachInExtents. \ +Your CUDAToolkit_VERSION is ${CUDAToolkit_VERSION}.") + endif() endif() endif() @@ -88,7 +92,7 @@ if (NOT mdspan_FOUND) endif() # Add mdspan include directory -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${mdspan_SOURCE_DIR}/include") +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${mdspan_SOURCE_DIR}/include" "${CUB_INCLUDE_DIR}") add_executable(serial miniWeather_mdspan.cpp) target_link_libraries(serial INTERFACE std::mdspan) diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 84a556b..5b0a36f 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -14,6 +14,9 @@ #include #include "pnetcdf.h" +// nvc++ in 25.1 doesn't like including this header. +//#include "cub/cub.cuh" + #include "mdspan/mdspan.hpp" #include "unique_mdarray.hpp" From add7b15978c19a007cbbe362ecbc614249b37611 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 31 Mar 2025 19:44:13 +0300 Subject: [PATCH 70/83] Increase required CMake version for better NVHPC support --- cpp-mdspan/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index fb89098..4624640 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -1,5 +1,8 @@ # CMake 3.17 adds FindCUDAToolkit. -cmake_minimum_required(VERSION 3.17) +# +# Kokkos's build system claims that before CMake 3.20, +# NVHPC was identified as PGI. +cmake_minimum_required(VERSION 3.20) project(miniWeather-mdspan VERSION 0.0.1 LANGUAGES CXX From 91446c9617397ccc529499bcbcaabdfa3b4172a7 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 31 Mar 2025 20:22:21 +0300 Subject: [PATCH 71/83] Start adding Kokkos and OpenACC support I haven't tested either of these yet. --- cpp-mdspan/CMakeLists.txt | 63 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 4624640..1d7f70f 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -7,6 +7,9 @@ project(miniWeather-mdspan VERSION 0.0.1 LANGUAGES CXX ) +include(FetchContent) + +message(STATUS "C++ compiler ID: ${CMAKE_CXX_COMPILER_ID}") # Find CUDA Toolkit; it's not required, # but other things (like CUB) depend on it. 
@@ -81,7 +84,6 @@ endif() find_package(mdspan QUIET) if (NOT mdspan_FOUND) message(STATUS "No installed mdspan found, fetching from Github") - include(FetchContent) FetchContent_Declare( mdspan GIT_REPOSITORY https://github.com/kokkos/mdspan.git @@ -97,6 +99,41 @@ endif() # Add mdspan include directory include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${mdspan_SOURCE_DIR}/include" "${CUB_INCLUDE_DIR}") +if (CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC") + include(FindOpenACC) + find_package(OpenACC REQUIRED) + if (OpenACC_FOUND) + message(STATUS "Found OpenACC: ${OpenACC_VERSION}") + message(STATUS "OpenACC_CXX_FLAGS: ${OpenACC_CXX_FLAGS}") + message(STATUS "OpenACC_CXX_OPTIONS: ${OpenACC_CXX_OPTIONS}") + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenACC_CXX_OPTIONS}") + else() + message(FATAL_ERROR "OpenACC not found") + endif() +endif() + +# Please see Kokkos' CMake instructions: +# +# https://kokkos.org/kokkos-core-wiki/get-started/integrating-kokkos-into-your-cmake-project.html +# +# If you want to use a previously installed Kokkos, +# then set the CMake variable Kokkos_ROOT to the path +# of your Kokkos installation. Otherwise, CMake will +# attempt to download Kokkos for you. +find_package(Kokkos CONFIG) +if (Kokkos_FOUND) + message(STATUS "Found Kokkos: ${Kokkos_DIR} (version \"${Kokkos_VERSION}\")") +else() + message(STATUS "No installed Kokkos found, fetching from Github") + include(FetchContent) + FetchContent_Declare( + Kokkos + GIT_REPOSITORY https://github.com/kokkos/kokkos.git + GIT_TAG 4.6.00 + ) + FetchContent_MakeAvailable(Kokkos) +endif() + add_executable(serial miniWeather_mdspan.cpp) target_link_libraries(serial INTERFACE std::mdspan) target_compile_options(serial PRIVATE @@ -106,6 +143,30 @@ target_compile_options(serial PRIVATE /W4> ) +if (Kokkos_FOUND) + add_executable(serial_kokkos miniWeather_mdspan.cpp) + target_link_libraries(serial_kokkos INTERFACE std::mdspan) + target_link_libraries(serial_kokkos PRIVATE Kokkos::kokkos) + target_compile_options(serial_kokkos PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> + ) +endif() + +if (OpenACC_FOUND) + add_executable(serial_openacc miniWeather_mdspan.cpp) + target_link_libraries(serial_openacc INTERFACE std::mdspan) + target_link_libraries(serial_openacc PRIVATE OpenACC::OpenACC_CXX) + target_compile_options(serial_openacc PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> + ) +endif() + add_executable(test_unique_mdarray test_unique_mdarray.cpp) target_link_libraries(test_unique_mdarray INTERFACE std::mdspan) target_compile_options(test_unique_mdarray PRIVATE From f765513664de590e37237e9b3520209d9835b7a4 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 31 Mar 2025 21:14:27 +0300 Subject: [PATCH 72/83] Simplify FetchContent use Also verify that OpenACC builds define _OPENACC. 
--- cpp-mdspan/CMakeLists.txt | 69 ++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 40 deletions(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 1d7f70f..8a623e7 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -81,35 +81,30 @@ else() endif() endif() -find_package(mdspan QUIET) -if (NOT mdspan_FOUND) - message(STATUS "No installed mdspan found, fetching from Github") - FetchContent_Declare( - mdspan - GIT_REPOSITORY https://github.com/kokkos/mdspan.git - GIT_TAG stable - ) - FetchContent_GetProperties(mdspan) - if(NOT mdspan_POPULATED) - FetchContent_Populate(mdspan) - add_subdirectory(${mdspan_SOURCE_DIR} ${mdspan_BINARY_DIR} EXCLUDE_FROM_ALL) - endif() -endif() +FetchContent_Declare( + mdspan + GIT_REPOSITORY https://github.com/kokkos/mdspan.git + GIT_TAG stable + FIND_PACKAGE_ARGS NAMES mdspan +) +FetchContent_MakeAvailable(mdspan) # Add mdspan include directory -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${mdspan_SOURCE_DIR}/include" "${CUB_INCLUDE_DIR}") - -if (CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC") - include(FindOpenACC) - find_package(OpenACC REQUIRED) - if (OpenACC_FOUND) - message(STATUS "Found OpenACC: ${OpenACC_VERSION}") - message(STATUS "OpenACC_CXX_FLAGS: ${OpenACC_CXX_FLAGS}") - message(STATUS "OpenACC_CXX_OPTIONS: ${OpenACC_CXX_OPTIONS}") - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenACC_CXX_OPTIONS}") - else() - message(FATAL_ERROR "OpenACC not found") - endif() +message(STATUS "mdspan_SOURCE_DIR: ${mdspan_SOURCE_DIR}") +message(STATUS "mdspan_BINARY_DIR: ${mdspan_BINARY_DIR}") +# mdspan_BINARY_DIR does not have the header files in it. +include_directories("${mdspan_SOURCE_DIR}/include") + +#include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${CUB_INCLUDE_DIR}") + +include(FindOpenACC) +find_package(OpenACC) +if (OpenACC_FOUND) + message(STATUS "Found OpenACC: ${OpenACC_VERSION}") + message(STATUS "OpenACC_CXX_FLAGS: ${OpenACC_CXX_FLAGS}") + message(STATUS "OpenACC_CXX_OPTIONS: ${OpenACC_CXX_OPTIONS}") +else() + message(STATUS "OpenACC not found") endif() # Please see Kokkos' CMake instructions: @@ -120,19 +115,13 @@ endif() # then set the CMake variable Kokkos_ROOT to the path # of your Kokkos installation. Otherwise, CMake will # attempt to download Kokkos for you. 
-find_package(Kokkos CONFIG)
-if (Kokkos_FOUND)
-  message(STATUS "Found Kokkos: ${Kokkos_DIR} (version \"${Kokkos_VERSION}\")")
-else()
-  message(STATUS "No installed Kokkos found, fetching from Github")
-  include(FetchContent)
-  FetchContent_Declare(
-    Kokkos
-    GIT_REPOSITORY https://github.com/kokkos/kokkos.git
-    GIT_TAG 4.6.00
-  )
-  FetchContent_MakeAvailable(Kokkos)
-endif()
+
+FetchContent_Declare(
+  Kokkos
+  GIT_REPOSITORY https://github.com/kokkos/kokkos.git
+  GIT_TAG 4.6.00
+)
+FetchContent_MakeAvailable(Kokkos)
 
 add_executable(serial miniWeather_mdspan.cpp)
 target_link_libraries(serial INTERFACE std::mdspan)

From 7ef97ce4e70d0c26df8ceaeeed76de830cf02450 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Tue, 1 Apr 2025 00:10:04 +0300
Subject: [PATCH 73/83] Abstract memory space (for allocations and
 deallocations)

---
 cpp-mdspan/miniWeather_mdspan.cpp | 141 ++++++++++++++++++++----------
 1 file changed, 93 insertions(+), 48 deletions(-)

diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp
index 5b0a36f..7058f1e 100644
--- a/cpp-mdspan/miniWeather_mdspan.cpp
+++ b/cpp-mdspan/miniWeather_mdspan.cpp
@@ -86,17 +86,36 @@ using view_1d_const = md::mdspan<const double, extents_1d>;
 // All dynamic array allocation happens here.
 // Deallocation other than through `delete [] ptr` would happen
 // through a custom Deleter (second template argument of `unique_ptr`).
+//
+// "auto" return type makes it easier for allocation
+// to depend on the build configuration.
+
+struct host_memory_space {};
 
-using alloc_3d = std::unique_ptr<double[]>;
-alloc_3d make_unique_array_3d(int X, int Y, int Z) {
+std::unique_ptr<double[]>
+make_unique_array_3d(host_memory_space, int X, int Y, int Z) {
   return std::make_unique<double[]>(X * Y * Z);
 }
-
-using alloc_1d = std::unique_ptr<double[]>;
-alloc_1d make_unique_array_1d(int X) {
+std::unique_ptr<double[]>
+make_unique_array_1d(host_memory_space, int X) {
   return std::make_unique<double[]>(X);
 }
 
+template <class MemorySpace>
+using alloc_3d = decltype(make_unique_array_3d(MemorySpace{}, 0, 0, 0));
+template <class MemorySpace>
+using alloc_1d = decltype(make_unique_array_1d(MemorySpace{}, 0));
+
+using default_memory_space = host_memory_space;
+
+auto make_unique_array_3d(int X, int Y, int Z) {
+  return make_unique_array_3d(default_memory_space{}, X, Y, Z);
+}
+auto make_unique_array_1d(int X) {
+  return make_unique_array_1d(default_memory_space{}, X);
+}
+
+
 // Variables that are set once in init and remain read-only throughout the simulation.
 struct global_const_scalars {
   int nx = nx_glob;
@@ -122,17 +141,18 @@ struct global_scalars {
 };
 
 // Arrays that are allocated and filled in init and never changed after that.
+template <class MemorySpace>
 class global_const_arrays {
  public:
-  global_const_arrays(int nx, int nz, int hs) :
+  global_const_arrays(MemorySpace memory_space, int nx, int nz, int hs) :
    nx_(nx),
    nz_(nz),
    hs_(hs),
-   hy_dens_cell_      (make_unique_array_1d(nz+2*hs)),
-   hy_dens_theta_cell_(make_unique_array_1d(nz+2*hs)),
-   hy_dens_int_       (make_unique_array_1d(nz+1)),
-   hy_dens_theta_int_ (make_unique_array_1d(nz+1)),
-   hy_pressure_int_   (make_unique_array_1d(nz+1))
+   hy_dens_cell_      (make_unique_array_1d(memory_space, nz+2*hs)),
+   hy_dens_theta_cell_(make_unique_array_1d(memory_space, nz+2*hs)),
+   hy_dens_int_       (make_unique_array_1d(memory_space, nz+1)),
+   hy_dens_theta_int_ (make_unique_array_1d(memory_space, nz+1)),
+   hy_pressure_int_   (make_unique_array_1d(memory_space, nz+1))
   {}
 
   // Const views exist for all use after init.
@@ -171,11 +191,11 @@ class global_const_arrays {
 
  private:
   int nx_, nz_, hs_;
-  alloc_1d hy_dens_cell_;            //hydrostatic density (vert cell avgs).
Dimensions: (1-hs:nz+hs) - alloc_1d hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) - alloc_1d hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) - alloc_1d hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) - alloc_1d hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_dens_cell_; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + alloc_1d hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + alloc_1d hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) }; // Arrays that are allocated in init and updated throughout the simulation. @@ -184,16 +204,17 @@ class global_const_arrays { // Respecting that also avoids divergence from the Python version. // This means that the mdspan must be layout_right; the intent appears // to be for C code to use row-major storage, but with Fortran ordering. +template class global_arrays { public: - global_arrays(int nx, int nz, int hs) : + global_arrays(MemorySpace memory_space, int nx, int nz, int hs) : nx_(nx), nz_(nz), hs_(hs), - state_ (make_unique_array_3d(NUM_VARS, nz+2*hs, nx+2*hs)), - state_tmp_(make_unique_array_3d(NUM_VARS, nz+2*hs, nx+2*hs)), - flux_ (make_unique_array_3d(NUM_VARS, nz+1, nx+1)), - tend_ (make_unique_array_3d(NUM_VARS, nz, nx)) + state_ (make_unique_array_3d(memory_space, NUM_VARS, nz+2*hs, nx+2*hs)), + state_tmp_(make_unique_array_3d(memory_space, NUM_VARS, nz+2*hs, nx+2*hs)), + flux_ (make_unique_array_3d(memory_space, NUM_VARS, nz+1, nx+1)), + tend_ (make_unique_array_3d(memory_space, NUM_VARS, nz, nx)) {} // The current model for member functions that get a view of an array @@ -230,20 +251,22 @@ class global_arrays { private: int nx_, nz_, hs_; - alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) - alloc_3d state_tmp_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) - alloc_3d flux_; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) - alloc_3d tend_; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) + alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d state_tmp_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d flux_; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) + alloc_3d tend_; // Fluid state tendencies. 
Dimensions: (nx,nz,NUM_VARS) }; +template struct init_result { global_const_scalars const_scalars; global_scalars scalars; - global_const_arrays const_arrays; - global_arrays arrays; + global_const_arrays const_arrays; + global_arrays arrays; }; -init_result init(int *argc , char ***argv); +template +init_result init(MemorySpace memory_space, int *argc , char ***argv); void finalize(); @@ -282,49 +305,60 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); +template void output(view_3d_const state, const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays, + const global_const_arrays& const_arrays, global_scalars& scalars); void ncwrap(int ierr, int line); +template void perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, + const global_const_arrays& c_arrays, global_scalars& scalars); +template void semi_discrete_step(view_3d_const state_init, view_3d state_forcing, view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, const global_const_scalars& scalars, - const global_const_arrays& arrays); + const global_const_arrays& arrays); +template void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, double dt, const global_const_scalars& scalars, - const global_const_arrays& arrays); + const global_const_arrays& arrays); +template void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, double dt, const global_const_scalars& scalars, - const global_const_arrays& arrays); + const global_const_arrays& arrays); +template void set_halo_values_x(view_3d state, - const global_const_scalars& scalars, const global_const_arrays& arrays); + const global_const_scalars& scalars, + const global_const_arrays& arrays); +template void set_halo_values_z(view_3d state, - const global_const_scalars& scalars, const global_const_arrays& arrays); + const global_const_scalars& scalars, + const global_const_arrays& arrays); struct reduction_result { double mass; double te; }; +template reduction_result reductions(view_3d_const state, const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays); + const global_const_arrays& const_arrays); /////////////////////////////////////////////////////////////////////////////////////// // THE MAIN PROGRAM STARTS HERE /////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - auto [const_scalars, scalars, const_arrays, arrays] = init( &argc , &argv ); + auto memory_space = default_memory_space{}; + auto [const_scalars, scalars, const_arrays, arrays] = init(memory_space, &argc , &argv ); //Initial reductions for mass, kinetic energy, and total energy. 
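// [Illustrative aside, not part of this patch.]  init returns init_result by
// value even though the array members own unique_ptr allocations: aggregate
// moves carry the buffers out, and main unpacks the result with structured
// bindings, so nothing is deep-copied.  A minimal standalone demo of the same
// pattern (demo_result and make_demo are invented names):

#include <memory>
#include <utility>

namespace binding_sketch {

struct demo_result {                 // stand-in for init_result
  int scalar;
  std::unique_ptr<double[]> array;   // move-only, like the global arrays
};

inline demo_result make_demo() {
  auto p = std::make_unique<double[]>(8);
  return demo_result{42, std::move(p)};  // moved into the aggregate
}

inline bool run_demo() {
  auto [scalar, array] = make_demo();    // structured bindings; moves, not copies
  return scalar == 42 && array != nullptr;
}

} // namespace binding_sketch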
// @@ -396,10 +430,11 @@ int main(int argc, char **argv) { // q** = q[n] + dt/2 * rhs(q* ) // q[n+1] = q[n] + dt/1 * rhs(q** ) // +template void perform_timestep(view_3d state, view_3d state_tmp, view_3d flux, view_3d tend, const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, + const global_const_arrays& c_arrays, global_scalars& scalars) { const double dt = scalars.dt; @@ -434,13 +469,14 @@ void perform_timestep(view_3d state, view_3d state_tmp, //state_out = state_init + dt * rhs(state_forcing) //Meaning the step starts from state_init, computes the rhs using state_forcing, //and stores the result in state_out +template void semi_discrete_step(view_3d_const state_init, view_3d state_forcing, view_3d state_out, double dt /* not scalars.dt */, direction dir, view_3d flux, view_3d tend, const global_const_scalars& scalars, - const global_const_arrays& arrays) + const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -488,10 +524,11 @@ void semi_discrete_step(view_3d_const state_init, //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes +template void compute_tendencies_x(view_3d_const state, view_3d flux, view_3d tend, double dt, const global_const_scalars& scalars, - const global_const_arrays& arrays) + const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -559,10 +596,11 @@ void compute_tendencies_x(view_3d_const state, //Since the halos are set in a separate routine, this will not require MPI //First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) //Then, compute the tendencies using those fluxes +template void compute_tendencies_z(view_3d_const state, view_3d flux, view_3d tend, double dt, const global_const_scalars& scalars, - const global_const_arrays& arrays) + const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -638,8 +676,10 @@ void compute_tendencies_z(view_3d_const state, //Set this MPI task's halo values in the x-direction. This routine will require MPI +template void set_halo_values_x(view_3d state, - const global_const_scalars& scalars, const global_const_arrays& arrays) + const global_const_scalars& scalars, + const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -687,8 +727,10 @@ void set_halo_values_x(view_3d state, //Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI //decomposition in the vertical direction +template void set_halo_values_z(view_3d state, - const global_const_scalars& scalars, const global_const_arrays& arrays) + const global_const_scalars& scalars, + const global_const_arrays& arrays) { const int nx = scalars.nx; const int nz = scalars.nz; @@ -719,7 +761,8 @@ void set_halo_values_z(view_3d state, } } -init_result init( int *argc , char ***argv ) { +template +init_result init(MemorySpace memory_space, int *argc , char ***argv ) { (void) MPI_Init(argc,argv); ///////////////////////////////////////////////////////////// @@ -746,7 +789,7 @@ init_result init( int *argc , char ***argv ) { int right_rank = 0; bool mainproc = (myrank == 0); - global_arrays gl_arrs(nx, nz, hs); + global_arrays gl_arrs(memory_space, nx, nz, hs); auto state = gl_arrs.state(); auto state_tmp = gl_arrs.state_tmp(); auto flux = gl_arrs.flux(); @@ -796,7 +839,7 @@ init_result init( int *argc , char ***argv ) { } } - global_const_arrays gl_const_arrs(nx, nz, hs); + global_const_arrays gl_const_arrs(memory_space, nx, nz, hs); // Get nonconst views, so we can fill them in below. auto hy_dens_cell = gl_const_arrs.hy_dens_cell(); auto hy_dens_theta_cell = gl_const_arrs.hy_dens_theta_cell(); @@ -994,9 +1037,10 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou //Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) //The file I/O uses parallel-netcdf, the only external library required for this mini-app. //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics +template void output(view_3d_const state, const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays, + const global_const_arrays& const_arrays, global_scalars& scalars) { const int nx = const_scalars.nx; @@ -1129,9 +1173,10 @@ void finalize() { //Compute reduced quantities for error checking without resorting to the "ncdiff" tool +template reduction_result reductions(view_3d_const state, const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays) + const global_const_arrays& const_arrays) { reduction_result result{0.0, 0.0}; const int nx = const_scalars.nx; From 4a8d84a59dfd298fcd06796aa3c64ba941ada40c Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 1 Apr 2025 02:19:45 +0300 Subject: [PATCH 74/83] Use Kokkos' mdspan if available --- cpp-mdspan/CMakeLists.txt | 117 +++++++++++++++++++------------ cpp-mdspan/build/cmake-kermit.sh | 2 + 2 files changed, 73 insertions(+), 46 deletions(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 8a623e7..2d205e4 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -11,36 +11,6 @@ include(FetchContent) message(STATUS "C++ compiler ID: ${CMAKE_CXX_COMPILER_ID}") -# Find CUDA Toolkit; it's not required, -# but other things (like CUB) depend on it. -find_package(CUDAToolkit) - -if (DEFINED CUB_INCLUDE_DIR) - if (EXISTS "${CUB_INCLUDE_DIR}/cub/cub.cuh") - message(STATUS "User-defined CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") - else() - message(FATAL_ERROR "CUB_INCLUDE_DIR=\"{CUB_INCLUDE_DIR}\" is \ - defined but \"${CUB_INCLUDE_DIR}/cub/cub.cuh\" does not exist.") - endif() -else() - if (CUDAToolkit_FOUND) - # Check whether CUDA Toolkit's include directory has CUB in it. 
- find_path(CUB_INCLUDE_DIR - NAMES cub/cub.cuh - PATHS - ${CUDAToolkit_INCLUDE_DIRS} - /usr/include - /usr/local/include - DOC "Path to CUB include directory." - ) - message(STATUS "Discovered CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") - if (CUDAToolkit_VERSION VERSION_LESS "12.8.0") - message(STATUS "CUDAToolkit_VERSION 12.6.85 is known to lack ForEachInExtents. \ -Your CUDAToolkit_VERSION is ${CUDAToolkit_VERSION}.") - endif() - endif() -endif() - # Option to override which version of the C++ Standard to use set(MINIWEATHER_CXX_STANDARD DETECT CACHE STRING "Override the default CXX_STANDARD") @@ -81,22 +51,6 @@ else() endif() endif() -FetchContent_Declare( - mdspan - GIT_REPOSITORY https://github.com/kokkos/mdspan.git - GIT_TAG stable - FIND_PACKAGE_ARGS NAMES mdspan -) -FetchContent_MakeAvailable(mdspan) - -# Add mdspan include directory -message(STATUS "mdspan_SOURCE_DIR: ${mdspan_SOURCE_DIR}") -message(STATUS "mdspan_BINARY_DIR: ${mdspan_BINARY_DIR}") -# mdspan_BINARY_DIR does not have the header files in it. -include_directories("${mdspan_SOURCE_DIR}/include") - -#include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include" "${CUB_INCLUDE_DIR}") - include(FindOpenACC) find_package(OpenACC) if (OpenACC_FOUND) @@ -107,6 +61,37 @@ else() message(STATUS "OpenACC not found") endif() + +# Find CUDA Toolkit; it's not required, +# but other things (like CUB) depend on it. +find_package(CUDAToolkit) + +if (DEFINED CUB_INCLUDE_DIR) + if (EXISTS "${CUB_INCLUDE_DIR}/cub/cub.cuh") + message(STATUS "User-defined CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") + else() + message(FATAL_ERROR "CUB_INCLUDE_DIR=\"{CUB_INCLUDE_DIR}\" is \ + defined but \"${CUB_INCLUDE_DIR}/cub/cub.cuh\" does not exist.") + endif() +else() + if (CUDAToolkit_FOUND) + # Check whether CUDA Toolkit's include directory has CUB in it. + find_path(CUB_INCLUDE_DIR + NAMES cub/cub.cuh + PATHS + ${CUDAToolkit_INCLUDE_DIRS} + /usr/include + /usr/local/include + DOC "Path to CUB include directory." + ) + message(STATUS "Discovered CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") + if (CUDAToolkit_VERSION VERSION_LESS "12.8.0") + message(STATUS "CUDAToolkit_VERSION 12.6.85 is known to lack ForEachInExtents. \ +Your CUDAToolkit_VERSION is ${CUDAToolkit_VERSION}.") + endif() + endif() +endif() + # Please see Kokkos' CMake instructions: # # https://kokkos.org/kokkos-core-wiki/get-started/integrating-kokkos-into-your-cmake-project.html @@ -115,6 +100,9 @@ endif() # then set the CMake variable Kokkos_ROOT to the path # of your Kokkos installation. Otherwise, CMake will # attempt to download Kokkos for you. +# +# Users can set FETCHCONTENT_SOURCE_DIR_KOKKOS +# to a local path to Kokkos, to override automatic downloading. FetchContent_Declare( Kokkos @@ -123,6 +111,43 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(Kokkos) +#FetchContent_GetProperties(Kokkos) # this doesn't help +message(STATUS "Kokkos_FOUND: ${Kokkos_FOUND}") # WHY IS THIS NOT DEFINED?!?!?!?!?! +message(STATUS "Kokkos_POPULATED: ${Kokkos_POPULATED}") # WHY IS THIS NOT DEFINED?!?!?!?!?! +message(STATUS "Kokkos_SOURCE_DIR: ${Kokkos_SOURCE_DIR}") +message(STATUS "Kokkos_BINARY_DIR: ${Kokkos_BINARY_DIR}") +message(STATUS "Kokkos_INCLUDE_DIRS: ${Kokkos_INCLUDE_DIRS}") # this is not defined; why? + +# Use the mdspan version that Kokkos installed, if Kokkos was indeed installed. +# Note that this is not a complete mdspan source tree; it just includes the headers. 
+if ((DEFINED Kokkos_SOURCE_DIR) AND (EXISTS "${Kokkos_SOURCE_DIR}")) # Kokkos_POPULATED would be nice + set(MINIWEATHER_MDSPAN_INCLUDE "${Kokkos_SOURCE_DIR}/tpls/mdspan/include") + message(STATUS "Using Kokkos' mdspan headers: ${MINIWEATHER_MDSPAN_INCLUDE}") +else() + FetchContent_Declare( + mdspan + GIT_REPOSITORY https://github.com/kokkos/mdspan.git + GIT_TAG stable + FIND_PACKAGE_ARGS NAMES mdspan + ) + FetchContent_MakeAvailable(mdspan) + + # Add mdspan include directory + message(STATUS "mdspan_SOURCE_DIR: ${mdspan_SOURCE_DIR}") + message(STATUS "mdspan_BINARY_DIR: ${mdspan_BINARY_DIR}") + message(STATUS "mdspan_INCLUDE_DIRS: ${mdspan_INCLUDE_DIRS}") + + set(MINIWEATHER_MDSPAN_INCLUDE "${mdspan_SOURCE_DIR}/include") + message(STATUS "Using FetchContent's mdspan headers: ${MINIWEATHER_MDSPAN_INCLUDE}") +endif() + +# The above doesn't need to add mdspan's include directory to the build, +# so it appears that we have to do this manually. +# +# mdspan_INCLUDE_DIRS is empty, so we have to set the include directory manually. +# mdspan_BINARY_DIR does not have the header files in it. +include_directories("${MINIWEATHER_MDSPAN_INCLUDE}") + add_executable(serial miniWeather_mdspan.cpp) target_link_libraries(serial INTERFACE std::mdspan) target_compile_options(serial PRIVATE diff --git a/cpp-mdspan/build/cmake-kermit.sh b/cpp-mdspan/build/cmake-kermit.sh index c30d9eb..06d6bc3 100755 --- a/cpp-mdspan/build/cmake-kermit.sh +++ b/cpp-mdspan/build/cmake-kermit.sh @@ -2,6 +2,8 @@ PNETCDF_ROOT=/raid/mhoemmen/pkg/pnetcdf-1.14.0 SRC_ROOT=/raid/mhoemmen/src/miniWeather/cpp-mdspan +mdspan_ROOT=/raid/mhoemmen/src/kokkos/mdspan +# Setting -Dmdspan_ROOT="${mdspan_ROOT}" has no effect on FetchContent. LDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" CXXFLAGS="-I${PNETCDF_ROOT}/include" cmake \ -DCMAKE_CXX_COMPILER=mpic++ \ From 581cd588e968a4b8971e004b7fb9978846c8354f Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Tue, 1 Apr 2025 23:57:53 +0300 Subject: [PATCH 75/83] Clean up CMake build logic --- cpp-mdspan/CMakeLists.txt | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 2d205e4..2eb9f6d 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -111,18 +111,26 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(Kokkos) -#FetchContent_GetProperties(Kokkos) # this doesn't help -message(STATUS "Kokkos_FOUND: ${Kokkos_FOUND}") # WHY IS THIS NOT DEFINED?!?!?!?!?! -message(STATUS "Kokkos_POPULATED: ${Kokkos_POPULATED}") # WHY IS THIS NOT DEFINED?!?!?!?!?! +# The above defines kokkos_POPULATED (lower-case name), but NOT Kokkos_POPULATED. +if (NOT kokkos_POPULATED) + message(FATAL_ERROR "Kokkos was not found") +endif() message(STATUS "Kokkos_SOURCE_DIR: ${Kokkos_SOURCE_DIR}") message(STATUS "Kokkos_BINARY_DIR: ${Kokkos_BINARY_DIR}") -message(STATUS "Kokkos_INCLUDE_DIRS: ${Kokkos_INCLUDE_DIRS}") # this is not defined; why? +#message(STATUS "Kokkos_INCLUDE_DIRS: ${Kokkos_INCLUDE_DIRS}") # this is not defined; why? +message(STATUS "kokkos_SOURCE_DIR: ${kokkos_SOURCE_DIR}") +message(STATUS "kokkos_BINARY_DIR: ${kokkos_BINARY_DIR}") +message(STATUS "kokkos_INCLUDE_DIRS: ${kokkos_INCLUDE_DIRS}") # Use the mdspan version that Kokkos installed, if Kokkos was indeed installed. # Note that this is not a complete mdspan source tree; it just includes the headers. 
-if ((DEFINED Kokkos_SOURCE_DIR) AND (EXISTS "${Kokkos_SOURCE_DIR}")) # Kokkos_POPULATED would be nice + +#if ((DEFINED Kokkos_SOURCE_DIR) AND (EXISTS "${Kokkos_SOURCE_DIR}")) +if (kokkos_POPULATED) set(MINIWEATHER_MDSPAN_INCLUDE "${Kokkos_SOURCE_DIR}/tpls/mdspan/include") message(STATUS "Using Kokkos' mdspan headers: ${MINIWEATHER_MDSPAN_INCLUDE}") + # Don't use target_link_libraries in this case. + include_directories("${MINIWEATHER_MDSPAN_INCLUDE}") else() FetchContent_Declare( mdspan @@ -135,10 +143,11 @@ else() # Add mdspan include directory message(STATUS "mdspan_SOURCE_DIR: ${mdspan_SOURCE_DIR}") message(STATUS "mdspan_BINARY_DIR: ${mdspan_BINARY_DIR}") - message(STATUS "mdspan_INCLUDE_DIRS: ${mdspan_INCLUDE_DIRS}") set(MINIWEATHER_MDSPAN_INCLUDE "${mdspan_SOURCE_DIR}/include") message(STATUS "Using FetchContent's mdspan headers: ${MINIWEATHER_MDSPAN_INCLUDE}") + # Don't use include_directories here; target_link_libraries + # (see below) will get mdspan's include directory just fine. endif() # The above doesn't need to add mdspan's include directory to the build, @@ -146,10 +155,12 @@ endif() # # mdspan_INCLUDE_DIRS is empty, so we have to set the include directory manually. # mdspan_BINARY_DIR does not have the header files in it. -include_directories("${MINIWEATHER_MDSPAN_INCLUDE}") add_executable(serial miniWeather_mdspan.cpp) -target_link_libraries(serial INTERFACE std::mdspan) +# We can't do this if mdspan wasn't found via FetchContent or find_package. +if (NOT kokkos_POPULATED) + target_link_libraries(serial PRIVATE std::mdspan) +endif() target_compile_options(serial PRIVATE $<$,$,$>: -Wall> @@ -157,10 +168,10 @@ target_compile_options(serial PRIVATE /W4> ) -if (Kokkos_FOUND) +if (kokkos_POPULATED) add_executable(serial_kokkos miniWeather_mdspan.cpp) - target_link_libraries(serial_kokkos INTERFACE std::mdspan) target_link_libraries(serial_kokkos PRIVATE Kokkos::kokkos) + # We got mdspan from Kokkos. 
target_compile_options(serial_kokkos PRIVATE $<$,$,$>: -Wall> @@ -171,7 +182,9 @@ endif() if (OpenACC_FOUND) add_executable(serial_openacc miniWeather_mdspan.cpp) - target_link_libraries(serial_openacc INTERFACE std::mdspan) + if (NOT kokkos_POPULATED) + target_link_libraries(serial_openacc PRIVATE std::mdspan) + endif() target_link_libraries(serial_openacc PRIVATE OpenACC::OpenACC_CXX) target_compile_options(serial_openacc PRIVATE $<$,$,$>: @@ -182,7 +195,9 @@ if (OpenACC_FOUND) endif() add_executable(test_unique_mdarray test_unique_mdarray.cpp) -target_link_libraries(test_unique_mdarray INTERFACE std::mdspan) +if (NOT kokkos_POPULATED) + target_link_libraries(test_unique_mdarray PRIVATE std::mdspan) +endif() target_compile_options(test_unique_mdarray PRIVATE $<$,$,$>: -Wall> From 9c2fc141842bee59260f9885b3fb14a1186c58d3 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Sat, 5 Apr 2025 00:19:14 +0300 Subject: [PATCH 76/83] Refactor miniWeather * Separate out all the loops that can be parallelized into their own functions * miniWeather_mdspan.cpp is the "generic executable" --- cpp-mdspan/CMakeLists.txt | 62 +- cpp-mdspan/build/cmake-kermit.sh | 19 +- cpp-mdspan/miniWeather_common.cpp | 111 +++ cpp-mdspan/miniWeather_common.hpp | 517 +++++++++++++ cpp-mdspan/miniWeather_mdspan.cpp | 1171 +---------------------------- cpp-mdspan/miniWeather_output.hpp | 137 ++++ cpp-mdspan/miniWeather_serial.hpp | 383 ++++++++++ 7 files changed, 1229 insertions(+), 1171 deletions(-) create mode 100644 cpp-mdspan/miniWeather_common.cpp create mode 100644 cpp-mdspan/miniWeather_common.hpp create mode 100644 cpp-mdspan/miniWeather_output.hpp create mode 100644 cpp-mdspan/miniWeather_serial.hpp diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 2eb9f6d..0a89c88 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -2,7 +2,9 @@ # # Kokkos's build system claims that before CMake 3.20, # NVHPC was identified as PGI. -cmake_minimum_required(VERSION 3.20) +# +# FetchContent_MakeAvailable gets new features in CMake 3.24. +cmake_minimum_required(VERSION 3.24) project(miniWeather-mdspan VERSION 0.0.1 LANGUAGES CXX @@ -150,18 +152,28 @@ else() # (see below) will get mdspan's include directory just fine. endif() -# The above doesn't need to add mdspan's include directory to the build, -# so it appears that we have to do this manually. -# -# mdspan_INCLUDE_DIRS is empty, so we have to set the include directory manually. -# mdspan_BINARY_DIR does not have the header files in it. +add_executable(test_unique_mdarray test_unique_mdarray.cpp) +# If building with Kokkos, we've already added the mdspan headers +# to the include path (see above). +if (NOT kokkos_POPULATED) + target_link_libraries(test_unique_mdarray PRIVATE std::mdspan) +endif() +target_compile_options(test_unique_mdarray PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> +) + +add_executable(miniWeather_serial miniWeather_mdspan.cpp miniWeather_common.cpp) +target_include_directories(miniWeather_serial PRIVATE "${PROJECT_SOURCE_DIR}") -add_executable(serial miniWeather_mdspan.cpp) -# We can't do this if mdspan wasn't found via FetchContent or find_package. +# If building with Kokkos, we've already added the mdspan headers +# to the include path (see above). 
if (NOT kokkos_POPULATED) - target_link_libraries(serial PRIVATE std::mdspan) + target_link_libraries(miniWeather_serial PRIVATE std::mdspan) endif() -target_compile_options(serial PRIVATE +target_compile_options(miniWeather_serial PRIVATE $<$,$,$>: -Wall> $<$: @@ -169,10 +181,12 @@ target_compile_options(serial PRIVATE ) if (kokkos_POPULATED) - add_executable(serial_kokkos miniWeather_mdspan.cpp) - target_link_libraries(serial_kokkos PRIVATE Kokkos::kokkos) + add_executable(miniWeather_kokkos miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_kokkos PRIVATE "${PROJECT_SOURCE_DIR}") + + target_link_libraries(miniWeather_kokkos PRIVATE Kokkos::kokkos) # We got mdspan from Kokkos. - target_compile_options(serial_kokkos PRIVATE + target_compile_options(miniWeather_kokkos PRIVATE $<$,$,$>: -Wall> $<$: @@ -181,12 +195,16 @@ if (kokkos_POPULATED) endif() if (OpenACC_FOUND) - add_executable(serial_openacc miniWeather_mdspan.cpp) + add_executable(miniWeather_openacc miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_openacc PRIVATE "${PROJECT_SOURCE_DIR}") + + # If building with Kokkos, we've already added the mdspan headers + # to the include path (see above). if (NOT kokkos_POPULATED) - target_link_libraries(serial_openacc PRIVATE std::mdspan) + target_link_libraries(miniWeather_openacc PRIVATE std::mdspan) endif() - target_link_libraries(serial_openacc PRIVATE OpenACC::OpenACC_CXX) - target_compile_options(serial_openacc PRIVATE + target_link_libraries(miniWeather_openacc PRIVATE OpenACC::OpenACC_CXX) + target_compile_options(miniWeather_openacc PRIVATE $<$,$,$>: -Wall> $<$: @@ -194,15 +212,5 @@ if (OpenACC_FOUND) ) endif() -add_executable(test_unique_mdarray test_unique_mdarray.cpp) -if (NOT kokkos_POPULATED) - target_link_libraries(test_unique_mdarray PRIVATE std::mdspan) -endif() -target_compile_options(test_unique_mdarray PRIVATE - $<$,$,$>: - -Wall> - $<$: - /W4> -) diff --git a/cpp-mdspan/build/cmake-kermit.sh b/cpp-mdspan/build/cmake-kermit.sh index 06d6bc3..5da9b6e 100755 --- a/cpp-mdspan/build/cmake-kermit.sh +++ b/cpp-mdspan/build/cmake-kermit.sh @@ -1,13 +1,26 @@ #!/bin/bash +# This script only works with nvc++. +# It assumes that mpic++ finds nvc++, mpicc finds nvc (!= nvcc), etc. + PNETCDF_ROOT=/raid/mhoemmen/pkg/pnetcdf-1.14.0 +PNETCDF_LDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" +PNETCDF_CXXFLAGS="-I${PNETCDF_ROOT}/include" SRC_ROOT=/raid/mhoemmen/src/miniWeather/cpp-mdspan -mdspan_ROOT=/raid/mhoemmen/src/kokkos/mdspan -# Setting -Dmdspan_ROOT="${mdspan_ROOT}" has no effect on FetchContent. -LDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" CXXFLAGS="-I${PNETCDF_ROOT}/include" cmake \ +# "-stdpar": "Could not find librt library, needed by CUDA::cudart_static" +# Adding "-rt" to LDFLAGS didn't seem to help. 
+ +KOKKOS_ROOT="/raid/mhoemmen/src/kokkos/kokkos" +# -DFETCHCONTENT_SOURCE_DIR_Kokkos="${KOKKOS_ROOT}" +# -DKokkos_ROOT="${KOKKOS_ROOT}" + +LDFLAGS="${PNETCDF_LDFLAGS}" CXXFLAGS="${PNETCDF_CXXFLAGS}" cmake \ -DCMAKE_CXX_COMPILER=mpic++ \ -DCMAKE_C_COMPILER=mpicc \ -DCMAKE_Fortran_COMPILER=mpif90 \ -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DFETCHCONTENT_SOURCE_DIR_KOKKOS="${KOKKOS_ROOT}" \ ${SRC_ROOT} + +# -DCMAKE_CXX_FLAGS="-stdpar" diff --git a/cpp-mdspan/miniWeather_common.cpp b/cpp-mdspan/miniWeather_common.cpp new file mode 100644 index 0000000..ae56336 --- /dev/null +++ b/cpp-mdspan/miniWeather_common.cpp @@ -0,0 +1,111 @@ +#include "miniWeather_common.hpp" + +std::unique_ptr +make_unique_array_3d(host_memory_space, int X, int Y, int Z) { + return std::make_unique(X * Y * Z); +} + +std::unique_ptr +make_unique_array_1d(host_memory_space, int X) { + return std::make_unique(X); +} + +test_case injection(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = 0.0; + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} + +test_case density_current(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0); + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} + +test_case gravity_waves(double x, double z) { + auto [hr, ht] = hydro_const_bvfreq(z, 0.02); + double r = 0.0; + double t = 0.0; + double u = 15.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} + +test_case thermal(double x, double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = sample_ellipse_cosine(x, z, 3.0, xlen/2,2000.0, 2000.0, 2000.0); + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} + +test_case collision(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = 0.0; + double u = 0.0; + double w = 0.0; + t = t + sample_ellipse_cosine(x, z, 20.0, xlen/2,2000.0, 2000.0, 2000.0); + t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2,8000.0, 2000.0, 2000.0); + return {r, u, w, t, hr, ht}; +} + +test_case get_test_case(int data_spec, double x_, double z_) { + if (data_spec == DATA_SPEC_COLLISION ) { return collision(x_, z_); } + if (data_spec == DATA_SPEC_THERMAL ) { return thermal(x_, z_); } + if (data_spec == DATA_SPEC_GRAVITY_WAVES ) { return gravity_waves(x_, z_); } + if (data_spec == DATA_SPEC_DENSITY_CURRENT) { return density_current(x_, z_); } + if (data_spec == DATA_SPEC_INJECTION ) { return injection(x_, z_); } + assert(false); + return test_case{}; +} + +r_t_pair hydro_const_theta(double z) { + const double theta0 = 300.; //Background potential temperature + const double exner0 = 1.; //Surface-level Exner pressure + double p,exner,rt; + //Establish hydrostatic balance first using Exner pressure + double t = theta0; //Potential Temperature at z + exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z + p = p0 * pow(exner,(cp/rd)); //Pressure at z + rt = pow((p / C0),(1. 
/ gamm)); //rho*theta at z + double r = rt / t; //Density at z + + return {r, t}; +} + +r_t_pair hydro_const_bvfreq(double z, double bv_freq0) { + const double theta0 = 300.; //Background potential temperature + const double exner0 = 1.; //Surface-level Exner pressure + double p, exner, rt; + double t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ); //Pot temp at z + exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z + p = p0 * pow(exner,(cp/rd)); //Pressure at z + rt = pow((p / C0), (1. / gamm)); //rho*theta at z + double r = rt / t; //Density at z + + return {r, t}; +} + +double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ) { + double dist; + //Compute distance from bubble center + dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.0) { + return amp * pow(cos(dist), 2.0); + } else { + return 0.; + } +} + +void finalize() { + (void) MPI_Finalize(); +} diff --git a/cpp-mdspan/miniWeather_common.hpp b/cpp-mdspan/miniWeather_common.hpp new file mode 100644 index 0000000..cde78cd --- /dev/null +++ b/cpp-mdspan/miniWeather_common.hpp @@ -0,0 +1,517 @@ +////////////////////////////////////////////////////////////////////////////////////////// +// miniWeather +// Author: Matt Norman , Oak Ridge National Laboratory +// This code simulates dry, stratified, compressible, non-hydrostatic fluid flows +// For documentation, please see the attached documentation in the "documentation" folder +// +////////////////////////////////////////////////////////////////////////////////////////// + +#pragma once + +#include +#include +#include +#include +#include + +// nvc++ in 25.1 doesn't like including this header. 
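// [Worked example, not part of this patch.]  A quick sanity check of
// hydro_const_theta from miniWeather_common.cpp above: at z = 0 the Exner
// pressure equals exner0 = 1, so p = p0 and rt = (p0/C0)^(1/gamm).
// Numerically that gives rt ~= 348.4 and r = rt/theta0 ~= 1.16 kg/m^3,
// a plausible surface air density.  Standalone version:

#include <cmath>
#include <cstdio>

namespace hydro_sketch {

inline void check_surface_density() {
  const double p0     = 1.e5;
  const double theta0 = 300.;
  const double C0     = 27.5629410929725921310572974482;
  const double gamm   = 1.40027894002789400278940027894;
  const double rt = std::pow(p0 / C0, 1. / gamm);  // rho*theta at the surface
  std::printf("rt = %.1f, r = %.3f kg/m^3\n", rt, rt / theta0);
  // Prints approximately: rt = 348.4, r = 1.161 kg/m^3
}

} // namespace hydro_sketch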
+//#include "cub/cub.cuh" + +#include "mdspan/mdspan.hpp" +#include "unique_mdarray.hpp" + +#define MINIWEATHER_ONLY_OUTPUT_THETA 1 + +constexpr double pi = 3.14159265358979323846264338327; //Pi +constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) +constexpr double cp = 1004.; //Specific heat of dry air at constant pressure +constexpr double cv = 717.; //Specific heat of dry air at constant volume +constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) +constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals +constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) +constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) +//Define domain and stability-related constants +constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) +constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) +constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] +constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) +constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) +constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction +constexpr int sten_size = 4; //Size of the stencil used for interpolation + +//Parameters for indexing and flags +constexpr int NUM_VARS = 4; //Number of fluid state variables +constexpr int ID_DENS = 0; //index for density ("rho") +constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") +constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") +constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") + +enum class direction { X, Z }; + +constexpr int DATA_SPEC_COLLISION = 1; +constexpr int DATA_SPEC_THERMAL = 2; +constexpr int DATA_SPEC_GRAVITY_WAVES = 3; +constexpr int DATA_SPEC_DENSITY_CURRENT = 5; +constexpr int DATA_SPEC_INJECTION = 6; + +constexpr int nqpoints = 3; +constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; +constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; + +/////////////////////////////////////////////////////////////////////////////////////// +// BEGIN USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// +//The x-direction length is twice as long as the z-direction length +//So, you'll want to have nx_glob be twice as large as nz_glob + +int constexpr nz_glob = 50; //Number of total cells in the z-direction +int constexpr nx_glob = 2 * nz_glob; //Number of total cells in the x-direction +double constexpr sim_time = 1000.0; //How many seconds to run the simulation +double constexpr output_freq = 10.0; //How frequently to output data to file (in seconds) +int constexpr data_spec_int = DATA_SPEC_THERMAL; //How to initialize the data +double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction +double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction 
+/////////////////////////////////////////////////////////////////////////////////////// +// END USER-CONFIGURABLE PARAMETERS +/////////////////////////////////////////////////////////////////////////////////////// + +// NUM_VARS is a compile-time constant, so we bake it into the extents type. +using extents_3d = md::extents; +using view_3d = md::mdspan; +using view_3d_const = md::mdspan; +using extents_1d = md::extents; // a.k.a. dims<1, int>; +using view_1d = md::mdspan; +using view_1d_const = md::mdspan; + +// All dynamic array allocation happens here. +// Deallocation other than through `delete [] ptr` would happen +// through a custom Deleter (second template argument of `unique_ptr`). +// +// "auto" return type makes it easier for allocation +// to depend on the build configuration. + +struct host_memory_space {}; +struct host_serial_execution_policy {}; + +std::unique_ptr +make_unique_array_3d(host_memory_space, int X, int Y, int Z); + +std::unique_ptr +make_unique_array_1d(host_memory_space, int X); + +template +using alloc_3d = decltype(make_unique_array_3d(MemorySpace{}, 0, 0, 0)); +template +using alloc_1d = decltype(make_unique_array_1d(MemorySpace{}, 0)); + +// Variables that are set once in init and remain read-only throughout the simulation. +struct global_const_scalars { + int nx = nx_glob; + int nz = nz_glob; //Number of local grid cells in the x- and z- dimensions for this MPI task + int i_beg = 0; + int k_beg = 0; //beginning index in the x- and z-directions for this MPI task + + int nranks = 1; + int myrank = 0; //Number of MPI ranks and my rank id + int left_rank = 0; + int right_rank = 0; //MPI Rank IDs that exist to my left and right in the global domain + + inline bool mainproc() const { return myrank == 0; } //Am I the main process (rank == 0)? +}; + +struct global_scalars { + // Model time step (seconds). The last time step might shorten this. + double dt; + double etime = 0.0; //Elapsed model time + double output_counter = 0.0; //Helps determine when it's time to do output + int num_out = 0; //Number of outputs performed + int direction_switch = 1; //Switch to alternate the order of directions +}; + +// Arrays that are allocated and filled in init and never changed after that. +template +class global_const_arrays { +public: + global_const_arrays(MemorySpace memory_space, int nx, int nz, int hs) : + nx_(nx), + nz_(nz), + hs_(hs), + hy_dens_cell_ (make_unique_array_1d(memory_space, nz+2*hs)), + hy_dens_theta_cell_(make_unique_array_1d(memory_space, nz+2*hs)), + hy_dens_int_ (make_unique_array_1d(memory_space, nz+1)), + hy_dens_theta_int_ (make_unique_array_1d(memory_space, nz+1)), + hy_pressure_int_ (make_unique_array_1d(memory_space, nz+1)) + {} + + // Const views exist for all use after init. + view_1d_const hy_dens_cell() const { + return view_1d_const{hy_dens_cell_.get(), nz_ + 2 * hs_}; + } + view_1d_const hy_dens_theta_cell() const { + return view_1d_const{hy_dens_theta_cell_.get(), nz_ + 2 * hs_}; + } + view_1d_const hy_dens_int() const { + return view_1d_const{hy_dens_int_.get(), nz_ + 1}; + } + view_1d_const hy_dens_theta_int() const { + return view_1d_const{hy_dens_theta_int_.get(), nz_ + 1}; + } + view_1d_const hy_pressure_int() const { + return view_1d_const{hy_pressure_int_.get(), nz_ + 1}; + } + + // Nonconst views exist for init. 
+ view_1d hy_dens_cell() { + return view_1d{hy_dens_cell_.get(), nz_ + 2 * hs_}; + } + view_1d hy_dens_theta_cell() { + return view_1d{hy_dens_theta_cell_.get(), nz_ + 2 * hs_}; + } + view_1d hy_dens_int() { + return view_1d{hy_dens_int_.get(), nz_ + 1}; + } + view_1d hy_dens_theta_int() { + return view_1d{hy_dens_theta_int_.get(), nz_ + 1}; + } + view_1d hy_pressure_int() { + return view_1d{hy_pressure_int_.get(), nz_ + 1}; + } + +private: + int nx_, nz_, hs_; + alloc_1d hy_dens_cell_; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) + alloc_1d hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) + alloc_1d hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) + alloc_1d hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) +}; + +// Arrays that are allocated in init and updated throughout the simulation. +// +// C indexing seems to prefer the extents in reverse order. +// Respecting that also avoids divergence from the Python version. +// This means that the mdspan must be layout_right; the intent appears +// to be for C code to use row-major storage, but with Fortran ordering. +template +class global_arrays { +public: + global_arrays(MemorySpace memory_space, int nx, int nz, int hs) : + nx_(nx), + nz_(nz), + hs_(hs), + state_ (make_unique_array_3d(memory_space, NUM_VARS, nz+2*hs, nx+2*hs)), + state_tmp_(make_unique_array_3d(memory_space, NUM_VARS, nz+2*hs, nx+2*hs)), + flux_ (make_unique_array_3d(memory_space, NUM_VARS, nz+1, nx+1)), + tend_ (make_unique_array_3d(memory_space, NUM_VARS, nz, nx)) + {} + + // The current model for member functions that get a view of an array + // is for the const-ness of the global_arrays object to determine + // whether the view is a view-of-const or view-of-nonconst. + // We might consider a different model where users explicitly declare + // access intent (read-only, write-only, or read-write) at the point of use. + view_3d state() { + // The various allocations have related dimensions that depend on + // just a few metadata (NUM_VARS, nz, nx, and hs). + // Storing extents for each allocation would duplicate metadata storage. + // Instead, we use flat allocations and construct layout mappings on the fly + // in the member functions that return (mdspan) views. + return view_3d{state_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; + } + view_3d state_tmp() { + return view_3d{state_tmp_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; + } + view_3d flux() { + return view_3d{flux_.get(), NUM_VARS, nz_ + 1, nx_ + 1}; + } + view_3d tend() { + return view_3d{tend_.get(), NUM_VARS, nz_, nx_}; + } + + view_3d_const state() const { + // The various allocations have related dimensions that depend on + // just a few metadata (NUM_VARS, nz, nx, and hs). + // Storing extents for each allocation would duplicate metadata storage. + // Instead, we use flat allocations and construct layout mappings on the fly + // in the member functions that return (mdspan) views. + return view_3d_const{state_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; + } + +private: + int nx_, nz_, hs_; + alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d state_tmp_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) + alloc_3d flux_; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) + alloc_3d tend_; // Fluid state tendencies. 
Dimensions: (nx,nz,NUM_VARS) +}; + +template +struct init_result { + global_const_scalars const_scalars; + global_scalars scalars; + global_const_arrays const_arrays; + global_arrays arrays; +}; + +struct test_case { + double r; + double u; + double w; + double t; + double hr; + double ht; +}; + +// For the various test case functions, x and z are input coordinates at which to sample; +// r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location; and +// hr and ht are output background hydrostatic density and potential temperature at that location. + +//This test case is initially balanced but injects fast, cold air from the left boundary near the model top +test_case injection(double x, double z); + +//Initialize a density current (falling cold thermal that propagates along the model bottom) +test_case density_current(double x, double z); + +test_case gravity_waves(double x, double z); + +//Rising thermal +test_case thermal(double x, double z); + +//Colliding thermals +test_case collision(double x, double z); + +test_case get_test_case(int data_spec, double x_, double z_); + +struct r_t_pair { + double r; + double t; +}; + +// Establish hydrostatic balance using constant potential temperature +// (thermally neutral atmosphere) +// z is the input coordinate +// r and t are the output background hydrostatic density and potential temperature +r_t_pair hydro_const_theta(double z); + +//Establish hydrostatic balance using constant Brunt-Vaisala frequency +//z is the input coordinate +//bv_freq0 is the constant Brunt-Vaisala frequency +//r and t are the output background hydrostatic density and potential temperature +r_t_pair hydro_const_bvfreq(double z, double bv_freq0); + +//Sample from an ellipse of a specified center, radius, and amplitude at a specified location +//x and z are input coordinates +//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse +double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, + double xrad, double zrad); + +// Perform a single time step. +// Time steps are dimensionally split and +// use a simple low-storage three-stage Runge-Kutta time integrator. +// The dimensional splitting is a second-order-accurate alternating Strang splitting +// that alternates the order of directions each time step. 
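// [Editorial aside, not part of this patch.]  Why alternating the sweep order
// makes the splitting second-order accurate: a single split step applies
// exp(dt Z) exp(dt X), whose error relative to exp(dt (X + Z)) is
// (dt^2/2) [Z, X] + O(dt^3) by the Baker-Campbell-Hausdorff formula.  The
// following step applies the sweeps in the opposite order, with leading error
// (dt^2/2) [X, Z] = -(dt^2/2) [Z, X].  Over each pair of steps the leading
// errors cancel, leaving a globally second-order method.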
+//
+// The Runge-Kutta method used here is defined as follows:
+//
+// q*     = q[n] + dt/3 * rhs(q[n])
+// q**    = q[n] + dt/2 * rhs(q*  )
+// q[n+1] = q[n] + dt/1 * rhs(q** )
+//
+template<class ExecutionPolicy, class MemorySpace>
+void perform_timestep(
+  ExecutionPolicy exec_policy,
+  view_3d state, view_3d state_tmp,
+  view_3d flux, view_3d tend,
+  const global_const_scalars& c_scalars,
+  const global_const_arrays<MemorySpace>& c_arrays,
+  global_scalars& scalars)
+{
+  const double dt = scalars.dt;
+  if (scalars.direction_switch) {
+    //x-direction first
+    semi_discrete_step(exec_policy, state, state    , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state    , dt / 1, direction::X, flux, tend, c_scalars, c_arrays);
+    //z-direction second
+    semi_discrete_step(exec_policy, state, state    , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state    , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays);
+  } else {
+    //z-direction first
+    semi_discrete_step(exec_policy, state, state    , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state    , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays);
+    //x-direction second
+    semi_discrete_step(exec_policy, state, state    , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state    , dt / 1, direction::X, flux, tend, c_scalars, c_arrays);
+  }
+  if (scalars.direction_switch) {
+    scalars.direction_switch = 0;
+  } else {
+    scalars.direction_switch = 1;
+  }
+}
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing,
+//and stores the result in state_out
+template<class ExecutionPolicy, class MemorySpace>
+void semi_discrete_step(
+  ExecutionPolicy exec_policy,
+  view_3d_const state_init,
+  view_3d state_forcing,
+  view_3d state_out,
+  double dt /* not scalars.dt */,
+  direction dir, view_3d flux, view_3d tend,
+  const global_const_scalars& scalars,
+  const global_const_arrays<MemorySpace>& arrays)
+{
+  if (dir == direction::X) {
+    //Set the halo values for this MPI task's fluid state in the x-direction
+    set_halo_values_x(exec_policy, state_forcing, scalars, arrays);
+    //Compute the time tendencies for the fluid state in the x-direction
+    compute_tendencies_x(exec_policy, state_forcing, flux, tend, dt, scalars, arrays);
+  } else if (dir == direction::Z) {
+    //Set the halo values for this MPI task's fluid state in the z-direction
+    set_halo_values_z(exec_policy, state_forcing, scalars, arrays);
+    //Compute the time tendencies for the fluid state in the z-direction
+    compute_tendencies_z(exec_policy, state_forcing, flux, tend, dt, scalars, arrays);
+  }
+
+  apply_tendencies_to_fluid_state(exec_policy, state_init, state_out, dt, tend, scalars, arrays);
+}
+
+template<class ExecutionPolicy, class MemorySpace>
+init_result<MemorySpace> init(
+  ExecutionPolicy exec_policy,
+  MemorySpace memory_space,
+  int *argc , char ***argv)
+{
+  (void)
MPI_Init(argc,argv); + + ///////////////////////////////////////////////////////////// + // BEGIN MPI DUMMY SECTION + // TODO: (1) GET NUMBER OF MPI RANKS + // (2) GET MY MPI RANK ID (RANKS ARE ZERO-BASED INDEX) + // (3) COMPUTE MY BEGINNING "I" INDEX (1-based index) + // (4) COMPUTE HOW MANY X-DIRECTION CELLS MY RANK HAS + // (5) FIND MY LEFT AND RIGHT NEIGHBORING RANK IDs + ///////////////////////////////////////////////////////////// + int i_beg = 0; + int nx = nx_glob; + + ////////////////////////////////////////////// + // END MPI DUMMY SECTION + ////////////////////////////////////////////// + + //Vertical direction isn't MPI-ized, so the rank's local values = the global values + int k_beg = 0; + int nz = nz_glob; + int nranks = 1; + int myrank = 0; + int left_rank = 0; + int right_rank = 0; + bool mainproc = (myrank == 0); + + global_arrays gl_arrs(memory_space, nx, nz, hs); + auto state = gl_arrs.state(); + auto state_tmp = gl_arrs.state_tmp(); + auto flux = gl_arrs.flux(); + auto tend = gl_arrs.tend(); + + //Define the maximum stable time step based on an assumed maximum wind speed + double dt = fmin(dx,dz) / max_speed * cfl; + + //If I'm the main process in MPI, display some grid information + if (mainproc) { + fprintf(stderr, "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob); + fprintf(stderr, "dx,dz: %lf %lf\n",dx,dz); + fprintf(stderr, "dt: %lf\n",dt); + } + //Want to make sure this info is displayed before further output + (void) MPI_Barrier(MPI_COMM_WORLD); + + initialize_cell_averaged_fluid_state(exec_policy, + state, state_tmp, nx, nz, i_beg, k_beg); + + global_const_arrays gl_const_arrs(memory_space, nx, nz, hs); + // Get nonconst views, so we can fill them in below. + auto hy_dens_cell = gl_const_arrs.hy_dens_cell(); + auto hy_dens_theta_cell = gl_const_arrs.hy_dens_theta_cell(); + auto hy_dens_int = gl_const_arrs.hy_dens_int(); + auto hy_dens_theta_int = gl_const_arrs.hy_dens_theta_int(); + auto hy_pressure_int = gl_const_arrs.hy_pressure_int(); + + compute_hydrostatic_background_state(exec_policy, + hy_dens_cell, hy_dens_theta_cell, + hy_dens_int, hy_dens_theta_int, hy_pressure_int, nz, k_beg); + + return init_result{ + global_const_scalars{ +#if defined(__cpp_designated_initializers) + .nx = nx, + .nz = nz, + .i_beg = i_beg, + .k_beg = k_beg, + .nranks = nranks, + .myrank = myrank, + .left_rank = left_rank, + .right_rank = right_rank +#else + nx, + nz, + i_beg, + k_beg, + nranks, + myrank, + left_rank, + right_rank +#endif + }, + global_scalars{ +#if defined(__cpp_designated_initializers) + .dt = dt, + .etime = 0.0, + .output_counter = 0.0, + .num_out = 0, + .direction_switch = 1 +#else + dt, + /* etime = */ 0.0, + /* output_counter = */ 0.0, + /* num_out = */ 0, + /* direction_switch = */ 1 +#endif + }, + std::move(gl_const_arrs), + std::move(gl_arrs) + }; +} + +struct reduction_result { + double mass; + double te; +}; + +//Compute reduced quantities for error checking without resorting to the "ncdiff" tool +template +reduction_result reductions( + ExecutionPolicy exec_policy, + view_3d_const state, + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays) +{ + reduction_result result = local_reductions(exec_policy, + state, const_scalars, const_arrays); + std::array loc{result.mass, result.te}; + std::array glob{0.0, 0.0}; + int ierr = MPI_Allreduce(loc.data(), glob.data(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return reduction_result{ + .mass = glob[0], + .te = glob[1] + }; +} + +void finalize(); diff --git 
a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index 7058f1e..c06114b 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -6,365 +6,41 @@ // ////////////////////////////////////////////////////////////////////////////////////////// -#include -#include -#include -#include +#include "miniWeather_common.hpp" +#include "miniWeather_output.hpp" +#include "miniWeather_serial.hpp" -#include -#include "pnetcdf.h" - -// nvc++ in 25.1 doesn't like including this header. -//#include "cub/cub.cuh" - -#include "mdspan/mdspan.hpp" -#include "unique_mdarray.hpp" - -#define MINIWEATHER_ONLY_OUTPUT_THETA 1 - -constexpr double pi = 3.14159265358979323846264338327; //Pi -constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) -constexpr double cp = 1004.; //Specific heat of dry air at constant pressure -constexpr double cv = 717.; //Specific heat of dry air at constant volume -constexpr double rd = 287.; //Dry air constant for equation of state (P=rho*rd*T) -constexpr double p0 = 1.e5; //Standard pressure at the surface in Pascals -constexpr double C0 = 27.5629410929725921310572974482; //Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma) -constexpr double gamm = 1.40027894002789400278940027894; //gamma=cp/Rd , have to call this gamm because "gamma" is taken (I hate C so much) -//Define domain and stability-related constants -constexpr double xlen = 2.e4; //Length of the domain in the x-direction (meters) -constexpr double zlen = 1.e4; //Length of the domain in the z-direction (meters) -constexpr double hv_beta = 0.05; //How strong to diffuse the solution: hv_beta \in [0:1] -constexpr double cfl = 1.50; //"Courant, Friedrichs, Lewy" number (for numerical stability) -constexpr double max_speed = 450; //Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec) -constexpr int hs = 2; //"Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction -constexpr int sten_size = 4; //Size of the stencil used for interpolation - -//Parameters for indexing and flags -constexpr int NUM_VARS = 4; //Number of fluid state variables -constexpr int ID_DENS = 0; //index for density ("rho") -constexpr int ID_UMOM = 1; //index for momentum in the x-direction ("rho * u") -constexpr int ID_WMOM = 2; //index for momentum in the z-direction ("rho * w") -constexpr int ID_RHOT = 3; //index for density * potential temperature ("rho * theta") - -enum class direction { X, Z }; - -constexpr int DATA_SPEC_COLLISION = 1; -constexpr int DATA_SPEC_THERMAL = 2; -constexpr int DATA_SPEC_GRAVITY_WAVES = 3; -constexpr int DATA_SPEC_DENSITY_CURRENT = 5; -constexpr int DATA_SPEC_INJECTION = 6; - -constexpr int nqpoints = 3; -constexpr double qpoints [] = { 0.112701665379258311482073460022E0 , 0.500000000000000000000000000000E0 , 0.887298334620741688517926539980E0 }; -constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444444444444444444444444E0 , 0.277777777777777777777777777779E0 }; - -/////////////////////////////////////////////////////////////////////////////////////// -// BEGIN USER-CONFIGURABLE PARAMETERS -/////////////////////////////////////////////////////////////////////////////////////// -//The x-direction length is twice as long as the z-direction length -//So, you'll want to have nx_glob be twice as large as nz_glob - -int constexpr nz_glob = 50; //Number of total cells in the z-direction -int constexpr 
nx_glob = 2 * nz_glob; //Number of total cells in the x-direction -double constexpr sim_time = 1000.0; //How many seconds to run the simulation -double constexpr output_freq = 10.0; //How frequently to output data to file (in seconds) -int constexpr data_spec_int = DATA_SPEC_THERMAL; //How to initialize the data -double constexpr dx = xlen / nx_glob; // grid spacing in the x-direction -double constexpr dz = zlen / nz_glob; // grid spacing in the x-direction -/////////////////////////////////////////////////////////////////////////////////////// -// END USER-CONFIGURABLE PARAMETERS -/////////////////////////////////////////////////////////////////////////////////////// - -// NUM_VARS is a compile-time constant, so we bake it into the extents type. -using extents_3d = md::extents; -using view_3d = md::mdspan; -using view_3d_const = md::mdspan; -using extents_1d = md::extents; // a.k.a. dims<1, int>; -using view_1d = md::mdspan; -using view_1d_const = md::mdspan; - -// All dynamic array allocation happens here. -// Deallocation other than through `delete [] ptr` would happen -// through a custom Deleter (second template argument of `unique_ptr`). -// -// "auto" return type makes it easier for allocation -// to depend on the build configuration. - -struct host_memory_space {}; - -std::unique_ptr -make_unique_array_3d(host_memory_space, int X, int Y, int Z) { - return std::make_unique(X * Y * Z); +auto default_memory_space() { + return host_memory_space{}; } -std::unique_ptr -make_unique_array_1d(host_memory_space, int X) { - return std::make_unique(X); -} - -template -using alloc_3d = decltype(make_unique_array_3d(MemorySpace{}, 0, 0, 0)); -template -using alloc_1d = decltype(make_unique_array_1d(MemorySpace{}, 0)); -using default_memory_space = host_memory_space; - -auto make_unique_array_3d(int X, int Y, int Z) { - return make_unique_array_3d(default_memory_space{}, X, Y, Z); -} -auto make_unique_array_1d(int X) { - return make_unique_array_1d(default_memory_space{}, X); +auto default_execution_policy() { + return host_serial_execution_policy{}; } - -// Variables that are set once in init and remain read-only throughout the simulation. -struct global_const_scalars { - int nx = nx_glob; - int nz = nz_glob; //Number of local grid cells in the x- and z- dimensions for this MPI task - int i_beg = 0; - int k_beg = 0; //beginning index in the x- and z-directions for this MPI task - - int nranks = 1; - int myrank = 0; //Number of MPI ranks and my rank id - int left_rank = 0; - int right_rank = 0; //MPI Rank IDs that exist to my left and right in the global domain - - bool mainproc() const { return myrank == 0; } //Am I the main process (rank == 0)? -}; - -struct global_scalars { - // Model time step (seconds). The last time step might shorten this. - double dt; - double etime = 0.0; //Elapsed model time - double output_counter = 0.0; //Helps determine when it's time to do output - int num_out = 0; //Number of outputs performed - int direction_switch = 1; //Switch to alternate the order of directions -}; - -// Arrays that are allocated and filled in init and never changed after that. 
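// [Illustrative sketch, not part of this patch; the real host-serial loops
// live in the new miniWeather_serial.hpp, which this patch adds but is not
// shown here.]  Each function in the refactor's list of parallelizable loops
// gains an execution-policy tag parameter, so a backend is just another
// overload set.  A host-serial overload of the simplest one might look like
// this; the loop body follows the original miniWeather update
// state_out = state_init + dt * tend, and element access uses C++23 mdspan
// operator[] (an assumption about this port's style):

#include "miniWeather_common.hpp"  // for the views, scalars, and policy tag

template<class MemorySpace>
void apply_tendencies_to_fluid_state(
  host_serial_execution_policy,
  view_3d_const state_init, view_3d state_out,
  double dt, view_3d_const tend,
  const global_const_scalars& scalars,
  const global_const_arrays<MemorySpace>& /* arrays; unused in this kernel */)
{
  const int nx = scalars.nx;
  const int nz = scalars.nz;
  for (int ll = 0; ll < NUM_VARS; ++ll) {
    for (int k = 0; k < nz; ++k) {
      for (int i = 0; i < nx; ++i) {
        state_out[ll, hs + k, hs + i] =
          state_init[ll, hs + k, hs + i] + dt * tend[ll, k, i];
      }
    }
  }
}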
-template -class global_const_arrays { -public: - global_const_arrays(MemorySpace memory_space, int nx, int nz, int hs) : - nx_(nx), - nz_(nz), - hs_(hs), - hy_dens_cell_ (make_unique_array_1d(memory_space, nz+2*hs)), - hy_dens_theta_cell_(make_unique_array_1d(memory_space, nz+2*hs)), - hy_dens_int_ (make_unique_array_1d(memory_space, nz+1)), - hy_dens_theta_int_ (make_unique_array_1d(memory_space, nz+1)), - hy_pressure_int_ (make_unique_array_1d(memory_space, nz+1)) - {} - - // Const views exist for all use after init. - view_1d_const hy_dens_cell() const { - return view_1d_const{hy_dens_cell_.get(), nz_ + 2 * hs_}; - } - view_1d_const hy_dens_theta_cell() const { - return view_1d_const{hy_dens_theta_cell_.get(), nz_ + 2 * hs_}; - } - view_1d_const hy_dens_int() const { - return view_1d_const{hy_dens_int_.get(), nz_ + 1}; - } - view_1d_const hy_dens_theta_int() const { - return view_1d_const{hy_dens_theta_int_.get(), nz_ + 1}; - } - view_1d_const hy_pressure_int() const { - return view_1d_const{hy_pressure_int_.get(), nz_ + 1}; - } - - // Nonconst views exist for init. - view_1d hy_dens_cell() { - return view_1d{hy_dens_cell_.get(), nz_ + 2 * hs_}; - } - view_1d hy_dens_theta_cell() { - return view_1d{hy_dens_theta_cell_.get(), nz_ + 2 * hs_}; - } - view_1d hy_dens_int() { - return view_1d{hy_dens_int_.get(), nz_ + 1}; - } - view_1d hy_dens_theta_int() { - return view_1d{hy_dens_theta_int_.get(), nz_ + 1}; - } - view_1d hy_pressure_int() { - return view_1d{hy_pressure_int_.get(), nz_ + 1}; - } - -private: - int nx_, nz_, hs_; - alloc_1d hy_dens_cell_; //hydrostatic density (vert cell avgs). Dimensions: (1-hs:nz+hs) - alloc_1d hy_dens_theta_cell_; //hydrostatic rho*t (vert cell avgs). Dimensions: (1-hs:nz+hs) - alloc_1d hy_dens_int_; //hydrostatic density (vert cell interf). Dimensions: (1:nz+1) - alloc_1d hy_dens_theta_int_; //hydrostatic rho*t (vert cell interf). Dimensions: (1:nz+1) - alloc_1d hy_pressure_int_; //hydrostatic press (vert cell interf). Dimensions: (1:nz+1) -}; - -// Arrays that are allocated in init and updated throughout the simulation. +// Intra-(MPI-process) parallelization needs to happen in the following functions. // -// C indexing seems to prefer the extents in reverse order. -// Respecting that also avoids divergence from the Python version. -// This means that the mdspan must be layout_right; the intent appears -// to be for C code to use row-major storage, but with Fortran ordering. -template -class global_arrays { -public: - global_arrays(MemorySpace memory_space, int nx, int nz, int hs) : - nx_(nx), - nz_(nz), - hs_(hs), - state_ (make_unique_array_3d(memory_space, NUM_VARS, nz+2*hs, nx+2*hs)), - state_tmp_(make_unique_array_3d(memory_space, NUM_VARS, nz+2*hs, nx+2*hs)), - flux_ (make_unique_array_3d(memory_space, NUM_VARS, nz+1, nx+1)), - tend_ (make_unique_array_3d(memory_space, NUM_VARS, nz, nx)) - {} - - // The current model for member functions that get a view of an array - // is for the const-ness of the global_arrays object to determine - // whether the view is a view-of-const or view-of-nonconst. - // We might consider a different model where users explicitly declare - // access intent (read-only, write-only, or read-write) at the point of use. - view_3d state() { - // The various allocations have related dimensions that depend on - // just a few metadata (NUM_VARS, nz, nx, and hs). - // Storing extents for each allocation would duplicate metadata storage. 
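// [Illustrative aside, not part of this patch.]  The accessor model described
// a few lines up survives the refactor: the const-ness of the global_arrays
// object selects the overload, and std::as_const makes read-only intent
// explicit at call sites (as main does below).  A sketch:

#include <utility>  // std::as_const

inline void accessor_model_demo(global_arrays<host_memory_space>& arrays) {
  view_3d rw = arrays.state();                       // non-const overload: writable elements
  view_3d_const ro = std::as_const(arrays).state();  // const overload: elements are const double
  // Writes compile only through rw; ro's element type rejects assignment.
  (void) rw;
  (void) ro;
}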
- // Instead, we use flat allocations and construct layout mappings on the fly - // in the member functions that return (mdspan) views. - return view_3d{state_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; - } - view_3d state_tmp() { - return view_3d{state_tmp_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; - } - view_3d flux() { - return view_3d{flux_.get(), NUM_VARS, nz_ + 1, nx_ + 1}; - } - view_3d tend() { - return view_3d{tend_.get(), NUM_VARS, nz_, nx_}; - } - - view_3d_const state() const { - // The various allocations have related dimensions that depend on - // just a few metadata (NUM_VARS, nz, nx, and hs). - // Storing extents for each allocation would duplicate metadata storage. - // Instead, we use flat allocations and construct layout mappings on the fly - // in the member functions that return (mdspan) views. - return view_3d_const{state_.get(), NUM_VARS, nz_ + 2 * hs_, nx_ + 2 * hs_}; - } - -private: - int nx_, nz_, hs_; - alloc_3d state_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) - alloc_3d state_tmp_; // Fluid state. Dimensions: (1-hs:nx+hs,1-hs:nz+hs,NUM_VARS) - alloc_3d flux_; // Cell interface fluxes. Dimensions: (nx+1,nz+1,NUM_VARS) - alloc_3d tend_; // Fluid state tendencies. Dimensions: (nx,nz,NUM_VARS) -}; - -template -struct init_result { - global_const_scalars const_scalars; - global_scalars scalars; - global_const_arrays const_arrays; - global_arrays arrays; -}; - -template -init_result init(MemorySpace memory_space, int *argc , char ***argv); - -void finalize(); - -struct test_case { - double r; - double u; - double w; - double t; - double hr; - double ht; -}; - -test_case injection(double x, double z); -test_case density_current(double x, double z); -test_case gravity_waves(double x, double z); -test_case thermal(double x, double z); -test_case collision(double x, double z); +// apply_tendencies_to_fluid_state +// set_halo_values_x +// set_halo_values_z +// compute_tendencies_x +// compute_tendencies_z +// initialize_cell_averaged_fluid_state +// compute_hydrostatic_background_state +// local_reductions -test_case get_test_case(int data_spec, double x_, double z_) { - if (data_spec == DATA_SPEC_COLLISION ) { return collision(x_, z_); } - if (data_spec == DATA_SPEC_THERMAL ) { return thermal(x_, z_); } - if (data_spec == DATA_SPEC_GRAVITY_WAVES ) { return gravity_waves(x_, z_); } - if (data_spec == DATA_SPEC_DENSITY_CURRENT) { return density_current(x_, z_); } - if (data_spec == DATA_SPEC_INJECTION ) { return injection(x_, z_); } - assert(false); - return test_case{}; -} - -struct r_t_pair { - double r; - double t; -}; - -r_t_pair hydro_const_theta(double z); -r_t_pair hydro_const_bvfreq(double z, double bv_freq0); -double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, - double xrad, double zrad); - -template -void output(view_3d_const state, - const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays, - global_scalars& scalars); -void ncwrap(int ierr, int line); -template -void perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, - global_scalars& scalars); -template -void semi_discrete_step(view_3d_const state_init, - view_3d state_forcing, - view_3d state_out, - double dt /* not scalars.dt */, - direction dir, view_3d flux, view_3d tend, - const global_const_scalars& scalars, - const global_const_arrays& arrays); -template -void compute_tendencies_x(view_3d_const state, - 
view_3d flux, view_3d tend, double dt, - const global_const_scalars& scalars, - const global_const_arrays& arrays); -template -void compute_tendencies_z(view_3d_const state, - view_3d flux, view_3d tend, double dt, - const global_const_scalars& scalars, - const global_const_arrays& arrays); -template -void set_halo_values_x(view_3d state, - const global_const_scalars& scalars, - const global_const_arrays& arrays); -template -void set_halo_values_z(view_3d state, - const global_const_scalars& scalars, - const global_const_arrays& arrays); - -struct reduction_result { - double mass; - double te; -}; -template -reduction_result reductions(view_3d_const state, - const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays); - -/////////////////////////////////////////////////////////////////////////////////////// -// THE MAIN PROGRAM STARTS HERE -/////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { - auto memory_space = default_memory_space{}; - auto [const_scalars, scalars, const_arrays, arrays] = init(memory_space, &argc , &argv ); + auto exec_policy = default_execution_policy(); + auto memory_space = default_memory_space(); + auto [const_scalars, scalars, const_arrays, arrays] = + init(exec_policy, memory_space, &argc , &argv); //Initial reductions for mass, kinetic energy, and total energy. // // mass0: initial domain total for mass // te0: initial domain total for total energy - auto [mass0, te0] = reductions(std::as_const(arrays).state(), const_scalars, const_arrays); + auto [mass0, te0] = reductions(exec_policy, + std::as_const(arrays).state(), const_scalars, const_arrays); #if ! defined(NO_INFORM) if (const_scalars.mainproc()) { fprintf(stderr, "mass0: %le\n" , mass0); @@ -373,7 +49,8 @@ int main(int argc, char **argv) { #endif //Output the initial state - output(std::as_const(arrays).state(), const_scalars, const_arrays, scalars); + output(exec_policy, std::as_const(arrays).state(), + const_scalars, const_arrays, scalars); //////////////////////////////////////////////////// // MAIN TIME STEP LOOP @@ -385,7 +62,8 @@ int main(int argc, char **argv) { if (scalars.etime + scalars.dt > sim_time) { scalars.dt = sim_time - scalars.etime; } - perform_timestep(arrays.state(), arrays.state_tmp(), arrays.flux(), + perform_timestep(exec_policy, + arrays.state(), arrays.state_tmp(), arrays.flux(), arrays.tend(), const_scalars, const_arrays, scalars); #if ! defined(NO_INFORM) if (const_scalars.mainproc()) { @@ -398,7 +76,8 @@ int main(int argc, char **argv) { //If it's time for output, reset the counter, and do output if (scalars.output_counter >= output_freq) { scalars.output_counter = scalars.output_counter - output_freq; - output(arrays.state(), const_scalars, const_arrays, scalars); + output(exec_policy, + arrays.state(), const_scalars, const_arrays, scalars); } } [[maybe_unused]] auto t2 = std::chrono::steady_clock::now(); @@ -409,7 +88,8 @@ int main(int argc, char **argv) { #endif //Final reductions for mass, kinetic energy, and total energy - auto [mass, te] = reductions(arrays.state(), const_scalars, const_arrays); + auto [mass, te] = reductions(exec_policy, + arrays.state(), const_scalars, const_arrays); if (const_scalars.mainproc()) { printf("d_mass: %le\n" , (mass - mass0)/mass0); printf("d_te: %le\n" , (te - te0 )/te0 ); @@ -417,794 +97,3 @@ int main(int argc, char **argv) { finalize(); } - -// Perform a single time step. 
-// Time steps are dimensionally split and -// use a simple low-storage three-stage Runge-Kutta time integrator. -// The dimensional splitting is a second-order-accurate alternating Strang splitting -// that alternates the order of directions each time step. -// -// The Runge-Kutta method used here is defined as follows: -// -// q* = q[n] + dt/3 * rhs(q[n]) -// q** = q[n] + dt/2 * rhs(q* ) -// q[n+1] = q[n] + dt/1 * rhs(q** ) -// -template -void perform_timestep(view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, - global_scalars& scalars) -{ - const double dt = scalars.dt; - if (scalars.direction_switch) { - //x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); - //z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays); - } else { - //z-direction second - semi_discrete_step(state, state , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays); - //x-direction first - semi_discrete_step(state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); - } - if (scalars.direction_switch) { - scalars.direction_switch = 0; - } else { - scalars.direction_switch = 1; - } -} - - -//Perform a single semi-discretized step in time with the form: -//state_out = state_init + dt * rhs(state_forcing) -//Meaning the step starts from state_init, computes the rhs using state_forcing, -//and stores the result in state_out -template -void semi_discrete_step(view_3d_const state_init, - view_3d state_forcing, - view_3d state_out, - double dt /* not scalars.dt */, - direction dir, view_3d flux, view_3d tend, - const global_const_scalars& scalars, - const global_const_arrays& arrays) -{ - const int nx = scalars.nx; - const int nz = scalars.nz; - - if (dir == direction::X) { - //Set the halo values for this MPI task's fluid state in the x-direction - set_halo_values_x(state_forcing, scalars, arrays); - //Compute the time tendencies for the fluid state in the x-direction - compute_tendencies_x(state_forcing, flux, tend, dt, scalars, arrays); - } else if (dir == direction::Z) { - //Set the halo values for this MPI task's fluid state in the z-direction - set_halo_values_z(state_forcing, scalars, arrays); - //Compute the time tendencies for the fluid state in the z-direction - compute_tendencies_z(state_forcing, flux, tend, dt, scalars, arrays); - } - - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - //Apply the tendencies to the fluid state - - { - view_3d_const tend_c = 
tend; - auto hy_dens_cell = arrays.hy_dens_cell(); - const int i_beg = scalars.i_beg; - const int k_beg = scalars.k_beg; - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { - const double x = (i_beg + i+0.5)*dx; - const double z = (k_beg + k+0.5)*dz; - const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); - tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k]; - } - state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend_c(ll, k, i); - } - } - } - } -} - - -//Compute the time tendencies of the fluid state using forcing in the x-direction -//Since the halos are set in a separate routine, this will not require MPI -//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) -//Then, compute the tendencies using those fluxes -template -void compute_tendencies_x(view_3d_const state, - view_3d flux, view_3d tend, double dt, - const global_const_scalars& scalars, - const global_const_arrays& arrays) -{ - const int nx = scalars.nx; - const int nz = scalars.nz; - // Hyperviscosity coefficient - const double hv_coef = -hv_beta * dx / (16*dt); - auto hy_dens_cell = arrays.hy_dens_cell(); - auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); - - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - //Compute fluxes in the x-direction for each cell - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx+1; ++i) { - //Use fourth-order interpolation from four cell averages - //to compute the value at the interface in question - std::array d3_vals; - std::array vals; - for (int ll = 0; ll < NUM_VARS; ++ll) { - std::array stencil; - for (int s = 0; s < sten_size; ++s) { - stencil[s] = state(ll, k+hs, i+s); - } - //Fourth-order-accurate interpolation of the state - vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12; - //First-order-accurate interpolation of the third spatial derivative - //of the state (for artificial viscosity) - d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]; - } - - //Compute density, u-wind, w-wind, potential temperature, - //and pressure (r,u,w,t,p respectively) - double r = vals[ID_DENS] + hy_dens_cell[k+hs]; - double u = vals[ID_UMOM] / r; - double w = vals[ID_WMOM] / r; - double t = ( vals[ID_RHOT] + hy_dens_theta_cell[k+hs] ) / r; - double p = C0 * pow(r*t, gamm); - - //Compute the flux vector - flux(ID_DENS, k, i) = r*u - hv_coef*d3_vals[ID_DENS]; - flux(ID_UMOM, k, i) = r*u*u+p - hv_coef*d3_vals[ID_UMOM]; - flux(ID_WMOM, k, i) = r*u*w - hv_coef*d3_vals[ID_WMOM]; - flux(ID_RHOT, k, i) = r*u*t - hv_coef*d3_vals[ID_RHOT]; - } - } - - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - //Use the fluxes to compute tendencies for each cell - { - view_3d_const flux_c = flux; - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - tend(ll, k, i) = -( flux_c(ll, k, i+1) - flux_c(ll, k, i) ) / dx; - } - } - } - } -} - - -//Compute the time tendencies of the fluid state using forcing in the z-direction -//Since the halos are set in a separate routine, this will not require MPI -//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity) -//Then, compute the tendencies using those fluxes -template -void 
compute_tendencies_z(view_3d_const state, - view_3d flux, view_3d tend, double dt, - const global_const_scalars& scalars, - const global_const_arrays& arrays) -{ - const int nx = scalars.nx; - const int nz = scalars.nz; - // Hyperviscosity coefficient - const double hv_coef = -hv_beta * dz / (16*dt); - auto hy_dens_int = arrays.hy_dens_int(); - auto hy_dens_theta_int = arrays.hy_dens_theta_int(); - auto hy_pressure_int = arrays.hy_pressure_int(); - - - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - //Compute fluxes in the x-direction for each cell - for (int k = 0; k < nz+1; ++k) { - for (int i = 0; i < nx; ++i) { - //Use fourth-order interpolation from four cell averages - //to compute the value at the interface in question - std::array d3_vals; - std::array vals; - for (int ll = 0; ll < NUM_VARS; ++ll) { - std::array stencil; - for (int s = 0; s < sten_size; ++s) { - stencil[s] = state(ll, k+s, i+hs); - } - //Fourth-order-accurate interpolation of the state - vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12; - //First-order-accurate interpolation of the third spatial derivative - //of the state - d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]; - } - - //Compute density, u-wind, w-wind, potential temperature, - //and pressure (r,u,w,t,p respectively) - double r = vals[ID_DENS] + hy_dens_int[k]; - double u = vals[ID_UMOM] / r; - double w = vals[ID_WMOM] / r; - double t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r; - double p = C0 * pow(r * t, gamm) - hy_pressure_int[k]; - //Enforce vertical boundary condition and exact mass conservation - if (k == 0 || k == nz) { - w = 0; - d3_vals[ID_DENS] = 0; - } - - //Compute the flux vector with hyperviscosity - flux(ID_DENS, k, i) = r*w - hv_coef*d3_vals[ID_DENS]; - flux(ID_UMOM, k, i) = r*w*u - hv_coef*d3_vals[ID_UMOM]; - flux(ID_WMOM, k, i) = r*w*w+p - hv_coef*d3_vals[ID_WMOM]; - flux(ID_RHOT, k, i) = r*w*t - hv_coef*d3_vals[ID_RHOT]; - } - } - - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - //Use the fluxes to compute tendencies for each cell - { - view_3d_const flux_c = flux; - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - tend(ll, k, i) = -( flux_c(ll, k+1, i) - flux_c(ll, k, i) ) / dz; - if (ll == ID_WMOM) { - tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav; - } - } - } - } - } -} - - - -//Set this MPI task's halo values in the x-direction. 
This routine will require MPI -template -void set_halo_values_x(view_3d state, - const global_const_scalars& scalars, - const global_const_arrays& arrays) -{ - const int nx = scalars.nx; - const int nz = scalars.nz; - - //////////////////////////////////////////////////////////////////////// - // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS - // (1) give state(1:hs,1:nz,1:NUM_VARS) to my left neighbor - // (2) receive state(1-hs:0,1:nz,1:NUM_VARS) from my left neighbor - // (3) give state(nx-hs+1:nx,1:nz,1:NUM_VARS) to my right neighbor - // (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor - //////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////// - // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI - ////////////////////////////////////////////////////// - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int k = 0; k < nz; ++k) { - state(ll, k+hs, 0) = state(ll, k+hs, nx+hs-2); - state(ll, k+hs, 1) = state(ll, k+hs, nx+hs-1); - state(ll, k+hs, nx+hs) = state(ll, k+hs, hs); - state(ll, k+hs, nx+hs+1) = state(ll, k+hs, hs+1); - } - } - //////////////////////////////////////////////////// - - if (data_spec_int == DATA_SPEC_INJECTION) { - if (scalars.myrank == 0) { - auto hy_dens_cell = arrays.hy_dens_cell(); - auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); - const int k_beg = scalars.k_beg; - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < hs; ++i) { - const double z = (k_beg + k+0.5)*dz; - if (fabs(z-3*zlen/4) <= zlen/16) { - state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 50.0; - state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 298.0 - - hy_dens_theta_cell[k+hs]; - } - } - } - } - } -} - - -//Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI -//decomposition in the vertical direction -template -void set_halo_values_z(view_3d state, - const global_const_scalars& scalars, - const global_const_arrays& arrays) -{ - const int nx = scalars.nx; - const int nz = scalars.nz; - auto hy_dens_cell = arrays.hy_dens_cell(); - - ///////////////////////////////////////////////// - // TODO: THREAD ME - ///////////////////////////////////////////////// - for (int ll = 0; ll < NUM_VARS; ++ll) { - for (int i = 0; i < nx+2*hs; ++i) { - if (ll == ID_WMOM) { - state(ll, 0, i) = 0.0; - state(ll, 1, i) = 0.0; - state(ll, nz+hs, i) = 0.0; - state(ll, nz+hs+1, i) = 0.0; - } else if (ll == ID_UMOM) { - state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0]; - state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1]; - state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs]; - state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1]; - } else { - state(ll, 0, i) = state(ll, hs, i); - state(ll, 1, i) = state(ll, hs, i); - state(ll, nz+hs, i) = state(ll, nz+hs-1, i); - state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i); - } - } - } -} - -template -init_result init(MemorySpace memory_space, int *argc , char ***argv ) { - (void) MPI_Init(argc,argv); - - ///////////////////////////////////////////////////////////// - // BEGIN MPI DUMMY SECTION - // TODO: (1) GET NUMBER OF MPI RANKS - // (2) GET MY MPI RANK ID (RANKS ARE ZERO-BASED INDEX) - // (3) COMPUTE MY BEGINNING "I" INDEX (1-based index) - // (4) COMPUTE HOW MANY X-DIRECTION CELLS MY RANK HAS - // (5) FIND MY LEFT AND RIGHT NEIGHBORING RANK IDs - ///////////////////////////////////////////////////////////// - int i_beg = 0; - int nx = nx_glob; - - ////////////////////////////////////////////// - // END MPI DUMMY SECTION - ////////////////////////////////////////////// - - //Vertical direction isn't MPI-ized, so the rank's local values = the global values - int k_beg = 0; - int nz = nz_glob; - int nranks = 1; - int myrank = 0; - int left_rank = 0; - int right_rank = 0; - bool mainproc = (myrank == 0); - - global_arrays gl_arrs(memory_space, nx, nz, hs); - auto state = gl_arrs.state(); - auto state_tmp = gl_arrs.state_tmp(); - auto flux = gl_arrs.flux(); - auto tend = gl_arrs.tend(); - - //Define the maximum stable time step based on an assumed maximum wind speed - double dt = fmin(dx,dz) / max_speed * cfl; - - //If I'm the main process in MPI, display some grid information - if (mainproc) { - fprintf(stderr, "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob); - fprintf(stderr, "dx,dz: %lf %lf\n",dx,dz); - fprintf(stderr, "dt: %lf\n",dt); - } - //Want to make sure this info is displayed before further output - (void) MPI_Barrier(MPI_COMM_WORLD); - - ////////////////////////////////////////////////////////////////////////// - // Initialize the cell-averaged fluid state via Gauss-Legendre quadrature - ////////////////////////////////////////////////////////////////////////// - for (int k = 0; k < nz+2*hs; ++k) { - for (int i = 0; i < nx+2*hs; ++i) { - //Initialize the state to zero - for (int ll = 0; ll < NUM_VARS; ++ll) { - state(ll, k, i) = 0.0; - } - //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation - for (int kk = 0; kk < nqpoints; ++kk) { - for (int ii = 0; ii < nqpoints; ++ii) { - //Compute the x,z location within the global domain based on cell and quadrature index - const double x = (i_beg + i-hs+0.5)*dx + 
(qpoints[ii]-0.5)*dx; - const double z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz; - - //Set the fluid state based on the user's specification - auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, x, z); - - //Store into the fluid state array - state(ID_DENS, k, i) = state(ID_DENS, k, i) + r * qweights[ii]*qweights[kk]; - state(ID_UMOM, k, i) = state(ID_UMOM, k, i) + (r+hr)*u * qweights[ii]*qweights[kk]; - state(ID_WMOM, k, i) = state(ID_WMOM, k, i) + (r+hr)*w * qweights[ii]*qweights[kk]; - state(ID_RHOT, k, i) = state(ID_RHOT, k, i) + ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]; - } - } - for (int ll = 0; ll < NUM_VARS; ++ll) { - state_tmp(ll, k, i) = state(ll, k, i); - } - } - } - - global_const_arrays gl_const_arrs(memory_space, nx, nz, hs); - // Get nonconst views, so we can fill them in below. - auto hy_dens_cell = gl_const_arrs.hy_dens_cell(); - auto hy_dens_theta_cell = gl_const_arrs.hy_dens_theta_cell(); - auto hy_dens_int = gl_const_arrs.hy_dens_int(); - auto hy_dens_theta_int = gl_const_arrs.hy_dens_theta_int(); - auto hy_pressure_int = gl_const_arrs.hy_pressure_int(); - - //Compute the hydrostatic background state over vertical cell averages - for (int k = 0; k < nz+2*hs; ++k) { - hy_dens_cell[k] = 0.; - hy_dens_theta_cell[k] = 0.; - for (int kk = 0; kk < nqpoints; ++kk) { - const double z = (k_beg + k-hs+0.5)*dz; - //Set the fluid state based on the user's specification - auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); - hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk]; - hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk]; - } - } - //Compute the hydrostatic background state at vertical cell interfaces - for (int k = 0; k < nz+1; ++k) { - const double z = (k_beg + k)*dz; - auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); - hy_dens_int [k] = hr; - hy_dens_theta_int[k] = hr * ht; - hy_pressure_int [k] = C0 * pow(hr * ht, gamm); - } - - return init_result{ - global_const_scalars{ -#if defined(__cpp_designated_initializers) - .nx = nx, - .nz = nz, - .i_beg = i_beg, - .k_beg = k_beg, - .nranks = nranks, - .myrank = myrank, - .left_rank = left_rank, - .right_rank = right_rank -#else - nx, - nz, - i_beg, - k_beg, - nranks, - myrank, - left_rank, - right_rank -#endif - }, - global_scalars{ -#if defined(__cpp_designated_initializers) - .dt = dt, - .etime = 0.0, - .output_counter = 0.0, - .num_out = 0, - .direction_switch = 1 -#else - dt, - /* etime = */ 0.0, - /* output_counter = */ 0.0, - /* num_out = */ 0, - /* direction_switch = */ 1 -#endif - }, - std::move(gl_const_arrs), - std::move(gl_arrs) - }; -} - - -//This test case is initially balanced but injects fast, cold air from the left boundary near the model top -//x and z are input coordinates at which to sample -//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location -//hr and ht are output background hydrostatic density and potential temperature at that location -test_case injection(double x , double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = 0.0; - double u = 0.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - - -//Initialize a density current (falling cold thermal that propagates along the model bottom) -//x and z are input coordinates at which to sample -//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location -//hr and ht are output background hydrostatic density and potential temperature at that location -test_case density_current(double x , 
double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0); - double u = 0.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - - -//x and z are input coordinates at which to sample -//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location -//hr and ht are output background hydrostatic density and potential temperature at that location -test_case gravity_waves(double x, double z) { - auto [hr, ht] = hydro_const_bvfreq(z, 0.02); - double r = 0.0; - double t = 0.0; - double u = 15.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - - -//Rising thermal -//x and z are input coordinates at which to sample -//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location -//hr and ht are output background hydrostatic density and potential temperature at that location -test_case thermal(double x, double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = sample_ellipse_cosine(x, z, 3.0, xlen/2,2000.0, 2000.0, 2000.0); - double u = 0.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - - -//Colliding thermals -//x and z are input coordinates at which to sample -//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location -//hr and ht are output background hydrostatic density and potential temperature at that location -test_case collision(double x , double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = 0.0; - double u = 0.0; - double w = 0.0; - t = t + sample_ellipse_cosine(x, z, 20.0, xlen/2,2000.0, 2000.0, 2000.0); - t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2,8000.0, 2000.0, 2000.0); - return {r, u, w, t, hr, ht}; -} - - -//Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere) -//z is the input coordinate -//r and t are the output background hydrostatic density and potential temperature -r_t_pair hydro_const_theta(double z) { - const double theta0 = 300.; //Background potential temperature - const double exner0 = 1.; //Surface-level Exner pressure - double p,exner,rt; - //Establish hydrostatic balance first using Exner pressure - double t = theta0; //Potential Temperature at z - exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z - p = p0 * pow(exner,(cp/rd)); //Pressure at z - rt = pow((p / C0),(1. / gamm)); //rho*theta at z - double r = rt / t; //Density at z - - return {r, t}; -} - - -//Establish hydrostatic balance using constant Brunt-Vaisala frequency -//z is the input coordinate -//bv_freq0 is the constant Brunt-Vaisala frequency -//r and t are the output background hydrostatic density and potential temperature -r_t_pair hydro_const_bvfreq(double z, double bv_freq0) { - const double theta0 = 300.; //Background potential temperature - const double exner0 = 1.; //Surface-level Exner pressure - double p, exner, rt; - double t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ); //Pot temp at z - exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z - p = p0 * pow(exner,(cp/rd)); //Pressure at z - rt = pow((p / C0), (1. 
/ gamm)); //rho*theta at z - double r = rt / t; //Density at z - - return {r, t}; -} - - -//Sample from an ellipse of a specified center, radius, and amplitude at a specified location -//x and z are input coordinates -//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse -double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ) { - double dist; - //Compute distance from bubble center - dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0; - //If the distance from bubble center is less than the radius, create a cos**2 profile - if (dist <= pi / 2.0) { - return amp * pow(cos(dist), 2.0); - } else { - return 0.; - } -} - - -//Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) -//The file I/O uses parallel-netcdf, the only external library required for this mini-app. -//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -template -void output(view_3d_const state, - const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays, - global_scalars& scalars) -{ - const int nx = const_scalars.nx; - const int nz = const_scalars.nz; - - int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; -#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - int dens_varid, uwnd_varid, wwnd_varid; -#endif - MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; - - //Inform the user - if (const_scalars.mainproc()) { fprintf(stderr, "*** OUTPUT ***\n"); } - - //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta). -#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - auto dens = md::make_unique_mdarray(nz, nx); - auto uwnd = md::make_unique_mdarray(nz, nx); - auto wwnd = md::make_unique_mdarray(nz, nx); -#endif - auto theta = md::make_unique_mdarray(nz, nx); - auto etimearr = std::make_unique(1); - - // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. - // It's possible that earlier PNetCDF versions tolerated MPI_INFO_NULL. - MPI_Info mpi_info; - auto info_err = MPI_Info_create(&mpi_info); - if (info_err != MPI_SUCCESS) { - fprintf(stderr, "Error creating MPI Info object\n"); - MPI_Abort(MPI_COMM_WORLD, -1); - } - - //If the elapsed time is zero, create the file. Otherwise, open the file - if (scalars.etime == 0) { - //Create the file - ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , mpi_info , &ncid ) , __LINE__ ); - //Create the dimensions - ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ ); - ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob , &x_dimid ) , __LINE__ ); - ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob , &z_dimid ) , __LINE__ ); - //Create the variables - dimids[0] = t_dimid; - ncwrap( ncmpi_def_var( ncid , "t_var" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ); - dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid; -#if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) - ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ); - ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ); - ncwrap( ncmpi_def_var( ncid , "wwnd" , NC_DOUBLE , 3 , dimids , &wwnd_varid ) , __LINE__ ); -#endif - ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ ); - //End "define" mode - ncwrap( ncmpi_enddef( ncid ) , __LINE__ ); - } else { - //Open the file - ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , mpi_info , &ncid ) , __LINE__ ); - //Get the variable IDs -#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - ncwrap( ncmpi_inq_varid( ncid , "dens" , &dens_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "wwnd" , &wwnd_varid ) , __LINE__ ); -#endif - ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ ); - ncwrap( ncmpi_inq_varid( ncid , "t_var" , &t_varid ) , __LINE__ ); - } - - //Store perturbed values in the temp arrays for output - - auto hy_dens_cell = const_arrays.hy_dens_cell(); - auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { -#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) - dens(k, i) = state(ID_DENS, k+hs, i+hs); - uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); - wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); -#endif - theta(k, i) = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[k+hs]) / - (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)) - - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]; - } - } - - //Write the grid data to file with all the processes writing collectively - const int k_beg = const_scalars.k_beg; - const int i_beg = const_scalars.i_beg; - - st3[0] = scalars.num_out; st3[1] = k_beg; st3[2] = i_beg; - ct3[0] = 1; ct3[1] = nz; ct3[2] = nx; -#if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) - ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens.get() ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd.get() ) , __LINE__ ); - ncwrap( ncmpi_put_vara_double_all( ncid , wwnd_varid , st3 , ct3 , wwnd.get() ) , __LINE__ ); -#endif - ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.get() ) , __LINE__ ); - - //Only the main process needs to write the elapsed time - //Begin "independent" write mode - ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ ); - //write elapsed time to file - if (const_scalars.mainproc()) { - st1[0] = scalars.num_out; - ct1[0] = 1; - etimearr[0] = scalars.etime; - ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr.get() ) , __LINE__ ); - } - //End "independent" write mode - ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ ); - - //Close the file - ncwrap( ncmpi_close(ncid) , __LINE__ ); - - (void) MPI_Info_free(&mpi_info); - scalars.num_out++; -} - - -//Error reporting routine for the PNetCDF I/O -void ncwrap( int ierr , int line ) { - if (ierr != NC_NOERR) { - fprintf(stderr, "NetCDF Error at line: %d\n", line); - fprintf(stderr, "%s\n", ncmpi_strerror(ierr)); - MPI_Abort(MPI_COMM_WORLD, -1); - } -} - - -void finalize() { - (void) MPI_Finalize(); -} - - -//Compute reduced quantities for error checking without resorting to the "ncdiff" tool -template -reduction_result reductions(view_3d_const state, - const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays) -{ - reduction_result result{0.0, 0.0}; - const int nx = const_scalars.nx; - const int nz = const_scalars.nz; - auto hy_dens_cell = const_arrays.hy_dens_cell(); - auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); - - for (int k = 0; k < nz; ++k) { - for (int i = 0; i < nx; ++i) { - double r = state(ID_DENS, k+hs, i+hs) + hy_dens_cell[hs+k]; // Density - double u = state(ID_UMOM, k+hs, i+hs) / r; // U-wind - double w = state(ID_WMOM, k+hs, i+hs) / r; // W-wind - double th = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta) - double p = C0 * pow(r * th, gamm); // Pressure - double t = th / pow(p0 / p, rd / cp); // Temperature - double ke = r*(u*u+w*w); // Kinetic Energy - double ie = r*cv*t; // Internal Energy - result.mass += r *dx*dz; // Accumulate domain mass - result.te += (ke + ie)*dx*dz; // Accumulate domain total energy - } - } - std::array loc{result.mass, result.te}; - std::array glob{0.0, 0.0}; - int ierr = MPI_Allreduce(loc.data(), glob.data(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - return reduction_result{ - .mass = glob[0], - .te = glob[1] - }; -} - - diff --git a/cpp-mdspan/miniWeather_output.hpp b/cpp-mdspan/miniWeather_output.hpp new file mode 100644 index 0000000..0fcb136 --- /dev/null +++ b/cpp-mdspan/miniWeather_output.hpp @@ -0,0 +1,137 @@ +#pragma once + +#include "miniWeather_common.hpp" +#include "pnetcdf.h" + +//Error reporting routine for the PNetCDF I/O +inline void ncwrap( int ierr , int line ) { + if (ierr != NC_NOERR) { + fprintf(stderr, "NetCDF Error at line: %d\n", line); + fprintf(stderr, "%s\n", ncmpi_strerror(ierr)); + MPI_Abort(MPI_COMM_WORLD, -1); + } +} + +//Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) +//The file I/O uses parallel-netcdf, the only external library required for this mini-app. 
+//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics +template +void output( + host_serial_execution_policy exec_policy, + view_3d_const state, + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays, + global_scalars& scalars) +{ + const int nx = const_scalars.nx; + const int nz = const_scalars.nz; + + int ncid, t_dimid, x_dimid, z_dimid, theta_varid, t_varid, dimids[3]; +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) + int dens_varid, uwnd_varid, wwnd_varid; +#endif + MPI_Offset st1[1], ct1[1], st3[3], ct3[3]; + + //Inform the user + if (const_scalars.mainproc()) { fprintf(stderr, "*** OUTPUT ***\n"); } + + //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta). +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) + auto dens = md::make_unique_mdarray(nz, nx); + auto uwnd = md::make_unique_mdarray(nz, nx); + auto wwnd = md::make_unique_mdarray(nz, nx); +#endif + auto theta = md::make_unique_mdarray(nz, nx); + auto etimearr = std::make_unique(1); + + // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL. + // It's possible that earlier PNetCDF versions tolerated MPI_INFO_NULL. + MPI_Info mpi_info; + auto info_err = MPI_Info_create(&mpi_info); + if (info_err != MPI_SUCCESS) { + fprintf(stderr, "Error creating MPI Info object\n"); + MPI_Abort(MPI_COMM_WORLD, -1); + } + + //If the elapsed time is zero, create the file. Otherwise, open the file + if (scalars.etime == 0) { + //Create the file + ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , mpi_info , &ncid ) , __LINE__ ); + //Create the dimensions + ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ ); + ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob , &x_dimid ) , __LINE__ ); + ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob , &z_dimid ) , __LINE__ ); + //Create the variables + dimids[0] = t_dimid; + ncwrap( ncmpi_def_var( ncid , "t_var" , NC_DOUBLE , 1 , dimids , &t_varid ) , __LINE__ ); + dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid; +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) + ncwrap( ncmpi_def_var( ncid , "dens" , NC_DOUBLE , 3 , dimids , &dens_varid ) , __LINE__ ); + ncwrap( ncmpi_def_var( ncid , "uwnd" , NC_DOUBLE , 3 , dimids , &uwnd_varid ) , __LINE__ ); + ncwrap( ncmpi_def_var( ncid , "wwnd" , NC_DOUBLE , 3 , dimids , &wwnd_varid ) , __LINE__ ); +#endif + ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ ); + //End "define" mode + ncwrap( ncmpi_enddef( ncid ) , __LINE__ ); + } else { + //Open the file + ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , mpi_info , &ncid ) , __LINE__ ); + //Get the variable IDs +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) + ncwrap( ncmpi_inq_varid( ncid , "dens" , &dens_varid ) , __LINE__ ); + ncwrap( ncmpi_inq_varid( ncid , "uwnd" , &uwnd_varid ) , __LINE__ ); + ncwrap( ncmpi_inq_varid( ncid , "wwnd" , &wwnd_varid ) , __LINE__ ); +#endif + ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ ); + ncwrap( ncmpi_inq_varid( ncid , "t_var" , &t_varid ) , __LINE__ ); + } + + //Store perturbed values in the temp arrays for output + + auto hy_dens_cell = const_arrays.hy_dens_cell(); + auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { +#if ! 
defined(MINIWEATHER_ONLY_OUTPUT_THETA) + dens(k, i) = state(ID_DENS, k+hs, i+hs); + uwnd(k, i) = state(ID_UMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); + wwnd(k, i) = state(ID_WMOM, k+hs, i+hs) / (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)); +#endif + theta(k, i) = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[k+hs]) / + (hy_dens_cell[k+hs] + state(ID_DENS, k+hs, i+hs)) - + hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]; + } + } + + //Write the grid data to file with all the processes writing collectively + const int k_beg = const_scalars.k_beg; + const int i_beg = const_scalars.i_beg; + + st3[0] = scalars.num_out; st3[1] = k_beg; st3[2] = i_beg; + ct3[0] = 1; ct3[1] = nz; ct3[2] = nx; +#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA) + ncwrap( ncmpi_put_vara_double_all( ncid , dens_varid , st3 , ct3 , dens.get() ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double_all( ncid , uwnd_varid , st3 , ct3 , uwnd.get() ) , __LINE__ ); + ncwrap( ncmpi_put_vara_double_all( ncid , wwnd_varid , st3 , ct3 , wwnd.get() ) , __LINE__ ); +#endif + ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.get() ) , __LINE__ ); + + //Only the main process needs to write the elapsed time + //Begin "independent" write mode + ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ ); + //write elapsed time to file + if (const_scalars.mainproc()) { + st1[0] = scalars.num_out; + ct1[0] = 1; + etimearr[0] = scalars.etime; + ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr.get() ) , __LINE__ ); + } + //End "independent" write mode + ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ ); + + //Close the file + ncwrap( ncmpi_close(ncid) , __LINE__ ); + + (void) MPI_Info_free(&mpi_info); + scalars.num_out++; +} diff --git a/cpp-mdspan/miniWeather_serial.hpp b/cpp-mdspan/miniWeather_serial.hpp new file mode 100644 index 0000000..b94d835 --- /dev/null +++ b/cpp-mdspan/miniWeather_serial.hpp @@ -0,0 +1,383 @@ +#pragma once + +#include "miniWeather_common.hpp" + +//Set this MPI task's halo values in the x-direction. 
+template +void set_halo_values_x( + host_serial_execution_policy /* exec_policy */, + view_3d state, + const global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + + //////////////////////////////////////////////////////////////////////// + // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS + // (1) give state(1:hs,1:nz,1:NUM_VARS) to my left neighbor + // (2) receive state(1-hs:0,1:nz,1:NUM_VARS) from my left neighbor + // (3) give state(nx-hs+1:nx,1:nz,1:NUM_VARS) to my right neighbor + // (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor + //////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////// + // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI + ////////////////////////////////////////////////////// + for (int ll = 0; ll < NUM_VARS; ++ll) { + for (int k = 0; k < nz; ++k) { + state(ll, k+hs, 0) = state(ll, k+hs, nx+hs-2); + state(ll, k+hs, 1) = state(ll, k+hs, nx+hs-1); + state(ll, k+hs, nx+hs) = state(ll, k+hs, hs); + state(ll, k+hs, nx+hs+1) = state(ll, k+hs, hs+1); + } + } + //////////////////////////////////////////////////// + + if (data_spec_int == DATA_SPEC_INJECTION) { + if (scalars.myrank == 0) { + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + const int k_beg = scalars.k_beg; + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < hs; ++i) { + const double z = (k_beg + k+0.5)*dz; + if (fabs(z-3*zlen/4) <= zlen/16) { + state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 50.0; + state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 298.0 - + hy_dens_theta_cell[k+hs]; + } + } + } + } + } +} + +//Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI +//decomposition in the vertical direction +template +void set_halo_values_z( + host_serial_execution_policy /* exec_policy */, + view_3d state, + const global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + auto hy_dens_cell = arrays.hy_dens_cell(); + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + for (int ll = 0; ll < NUM_VARS; ++ll) { + for (int i = 0; i < nx+2*hs; ++i) { + if (ll == ID_WMOM) { + state(ll, 0, i) = 0.0; + state(ll, 1, i) = 0.0; + state(ll, nz+hs, i) = 0.0; + state(ll, nz+hs+1, i) = 0.0; + } else if (ll == ID_UMOM) { + state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0]; + state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1]; + state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs]; + state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1]; + } else { + state(ll, 0, i) = state(ll, hs, i); + state(ll, 1, i) = state(ll, hs, i); + state(ll, nz+hs, i) = state(ll, nz+hs-1, i); + state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i); + } + } + } +} + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +template +void compute_tendencies_x( + host_serial_execution_policy /* exec_policy */, + view_3d_const state, + view_3d flux, view_3d tend, double dt, + const global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + // Hyperviscosity coefficient + const double hv_coef = -hv_beta * dx / (16*dt); + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + //Compute fluxes in the x-direction for each cell + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx+1; ++i) { + //Use fourth-order interpolation from four cell averages + //to compute the value at the interface in question + std::array d3_vals; + std::array vals; + for (int ll = 0; ll < NUM_VARS; ++ll) { + std::array stencil; + for (int s = 0; s < sten_size; ++s) { + stencil[s] = state(ll, k+hs, i+s); + } + //Fourth-order-accurate interpolation of the state + vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12; + //First-order-accurate interpolation of the third spatial derivative + //of the state (for artificial viscosity) + d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]; + } + + //Compute density, u-wind, w-wind, potential temperature, + //and pressure (r,u,w,t,p respectively) + double r = vals[ID_DENS] + hy_dens_cell[k+hs]; + double u = vals[ID_UMOM] / r; + double w = vals[ID_WMOM] / r; + double t = ( vals[ID_RHOT] + hy_dens_theta_cell[k+hs] ) / r; + double p = C0 * pow(r*t, gamm); + + //Compute the flux vector + flux(ID_DENS, k, i) = r*u - hv_coef*d3_vals[ID_DENS]; + flux(ID_UMOM, k, i) = r*u*u+p - hv_coef*d3_vals[ID_UMOM]; + flux(ID_WMOM, k, i) = r*u*w - hv_coef*d3_vals[ID_WMOM]; + flux(ID_RHOT, k, i) = r*u*t - hv_coef*d3_vals[ID_RHOT]; + } + } + + 
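+      // Where the interpolation weights above come from: reconstructing the
+      // interface value from the four neighboring cell averages with weights
+      // (-1, 7, 7, -1)/12 is exact for cell averages of polynomials up to
+      // cubic degree, hence the fourth-order accuracy claimed above.  For
+      // example, a constant state q gives (-q + 7q + 7q - q)/12 = q.
+      // The (-1, 3, -3, 1) stencil in d3_vals approximates dx^3 times the
+      // third derivative; it vanishes for that same constant state, so the
+      // hyperviscosity term never damps a uniform flow.  The remaining
+      // dimensional factors are absorbed into hv_coef = -hv_beta*dx/(16*dt).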
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Use the fluxes to compute tendencies for each cell
+  {
+    view_3d_const flux_c = flux;
+    for (int ll = 0; ll < NUM_VARS; ++ll) {
+      for (int k = 0; k < nz; ++k) {
+        for (int i = 0; i < nx; ++i) {
+          tend(ll, k, i) = -( flux_c(ll, k, i+1) - flux_c(ll, k, i) ) / dx;
+        }
+      }
+    }
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//Since the halos are set in a separate routine, this will not require MPI
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+template<class MemorySpace>
+void compute_tendencies_z(
+  host_serial_execution_policy /* exec_policy */,
+  view_3d_const state,
+  view_3d flux, view_3d tend, double dt,
+  const global_const_scalars& scalars,
+  const global_const_arrays<MemorySpace>& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  // Hyperviscosity coefficient
+  const double hv_coef = -hv_beta * dz / (16*dt);
+  auto hy_dens_int = arrays.hy_dens_int();
+  auto hy_dens_theta_int = arrays.hy_dens_theta_int();
+  auto hy_pressure_int = arrays.hy_pressure_int();
+
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Compute fluxes in the z-direction for each cell
+  for (int k = 0; k < nz+1; ++k) {
+    for (int i = 0; i < nx; ++i) {
+      //Use fourth-order interpolation from four cell averages
+      //to compute the value at the interface in question
+      std::array<double, NUM_VARS> d3_vals;
+      std::array<double, NUM_VARS> vals;
+      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        std::array<double, sten_size> stencil;
+        for (int s = 0; s < sten_size; ++s) {
+          stencil[s] = state(ll, k+s, i+hs);
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12;
+        //First-order-accurate interpolation of the third spatial derivative
+        //of the state
+        d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature,
+      //and pressure (r,u,w,t,p respectively)
+      double r = vals[ID_DENS] + hy_dens_int[k];
+      double u = vals[ID_UMOM] / r;
+      double w = vals[ID_WMOM] / r;
+      double t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      double p = C0 * pow(r * t, gamm) - hy_pressure_int[k];
+      //Enforce vertical boundary condition and exact mass conservation
+      if (k == 0 || k == nz) {
+        w = 0;
+        d3_vals[ID_DENS] = 0;
+      }
+
+      //Compute the flux vector with hyperviscosity
+      flux(ID_DENS, k, i) = r*w     - hv_coef*d3_vals[ID_DENS];
+      flux(ID_UMOM, k, i) = r*w*u   - hv_coef*d3_vals[ID_UMOM];
+      flux(ID_WMOM, k, i) = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*w*t   - hv_coef*d3_vals[ID_RHOT];
+    }
+  }
+
+  /////////////////////////////////////////////////
+  // TODO: THREAD ME
+  /////////////////////////////////////////////////
+  //Use the fluxes to compute tendencies for each cell
+  {
+    view_3d_const flux_c = flux;
+    for (int ll = 0; ll < NUM_VARS; ++ll) {
+      for (int k = 0; k < nz; ++k) {
+        for (int i = 0; i < nx; ++i) {
+          tend(ll, k, i) = -( flux_c(ll, k+1, i) - flux_c(ll, k, i) ) / dz;
+          if (ll == ID_WMOM) {
+            tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav;
+          }
+        }
+      }
+    }
+  }
+}
+
+template<class MemorySpace>
+void apply_tendencies_to_fluid_state(
+  host_serial_execution_policy /* exec_policy */,
+  view_3d_const state_init,
+  view_3d state_out,
+  double dt /* not scalars.dt */,
+  view_3d tend,
+  const
global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + + ///////////////////////////////////////////////// + // TODO: THREAD ME + ///////////////////////////////////////////////// + + auto hy_dens_cell = arrays.hy_dens_cell(); + view_3d_const tend_c = tend; + for (int ll = 0; ll < NUM_VARS; ++ll) { + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { + if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + const int i_beg = scalars.i_beg; + const int k_beg = scalars.k_beg; + const double x = (i_beg + i+0.5)*dx; + const double z = (k_beg + k+0.5)*dz; + const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); + tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k]; + } + state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend_c(ll, k, i); + } + } + } +} + +// Initialize the cell-averaged fluid state via Gauss-Legendre quadrature +void initialize_cell_averaged_fluid_state( + host_serial_execution_policy /* exec_policy */, + view_3d state, view_3d state_tmp, + int nx, int nz, + int i_beg, int k_beg) +{ + for (int k = 0; k < nz+2*hs; ++k) { + for (int i = 0; i < nx+2*hs; ++i) { + //Initialize the state to zero + for (int ll = 0; ll < NUM_VARS; ++ll) { + state(ll, k, i) = 0.0; + } + //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation + for (int kk = 0; kk < nqpoints; ++kk) { + for (int ii = 0; ii < nqpoints; ++ii) { + //Compute the x,z location within the global domain based on cell and quadrature index + const double x = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx; + const double z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz; + + //Set the fluid state based on the user's specification + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, x, z); + + //Store into the fluid state array + state(ID_DENS, k, i) = state(ID_DENS, k, i) + r * qweights[ii]*qweights[kk]; + state(ID_UMOM, k, i) = state(ID_UMOM, k, i) + (r+hr)*u * qweights[ii]*qweights[kk]; + state(ID_WMOM, k, i) = state(ID_WMOM, k, i) + (r+hr)*w * qweights[ii]*qweights[kk]; + state(ID_RHOT, k, i) = state(ID_RHOT, k, i) + ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]; + } + } + for (int ll = 0; ll < NUM_VARS; ++ll) { + state_tmp(ll, k, i) = state(ll, k, i); + } + } + } +} + +void compute_hydrostatic_background_state( + host_serial_execution_policy /* exec_policy */, + view_1d hy_dens_cell, + view_1d hy_dens_theta_cell, + view_1d hy_dens_int, + view_1d hy_dens_theta_int, + view_1d hy_pressure_int, + int nz, + int k_beg) +{ + //Compute the hydrostatic background state over vertical cell averages + for (int k = 0; k < nz+2*hs; ++k) { + hy_dens_cell[k] = 0.0; + hy_dens_theta_cell[k] = 0.0; + for (int kk = 0; kk < nqpoints; ++kk) { + const double z = (k_beg + k-hs+0.5)*dz; + //Set the fluid state based on the user's specification + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); + hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk]; + hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk]; + } + } + //Compute the hydrostatic background state at vertical cell interfaces + for (int k = 0; k < nz+1; ++k) { + const double z = (k_beg + k)*dz; + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); + hy_dens_int [k] = hr; + hy_dens_theta_int[k] = hr * ht; + hy_pressure_int [k] = C0 * pow(hr * ht, gamm); + } +} + +template +reduction_result local_reductions( + host_serial_execution_policy exec_policy, + view_3d_const state, + 
const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays) +{ + reduction_result result{0.0, 0.0}; + const int nx = const_scalars.nx; + const int nz = const_scalars.nz; + auto hy_dens_cell = const_arrays.hy_dens_cell(); + auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); + + for (int k = 0; k < nz; ++k) { + for (int i = 0; i < nx; ++i) { + double r = state(ID_DENS, k+hs, i+hs) + hy_dens_cell[hs+k]; // Density + double u = state(ID_UMOM, k+hs, i+hs) / r; // U-wind + double w = state(ID_WMOM, k+hs, i+hs) / r; // W-wind + double th = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta) + double p = C0 * pow(r * th, gamm); // Pressure + double t = th / pow(p0 / p, rd / cp); // Temperature + double ke = r*(u*u+w*w); // Kinetic Energy + double ie = r*cv*t; // Internal Energy + result.mass += r *dx*dz; // Accumulate domain mass + result.te += (ke + ie)*dx*dz; // Accumulate domain total energy + } + } + return result; +} + From f02431e8fac36962c96b5a063f278766f3d297e9 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Sat, 5 Apr 2025 02:08:33 +0300 Subject: [PATCH 77/83] Kokkos and serial builds now work That is, a single run of `make` can build both of them. --- cpp-mdspan/CMakeLists.txt | 1 + cpp-mdspan/miniWeather_common.hpp | 206 ------------- cpp-mdspan/miniWeather_generic_algs.hpp | 210 +++++++++++++ cpp-mdspan/miniWeather_kokkos.hpp | 390 ++++++++++++++++++++++++ cpp-mdspan/miniWeather_mdspan.cpp | 14 +- cpp-mdspan/miniWeather_output.hpp | 6 +- 6 files changed, 618 insertions(+), 209 deletions(-) create mode 100644 cpp-mdspan/miniWeather_generic_algs.hpp create mode 100644 cpp-mdspan/miniWeather_kokkos.hpp diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 0a89c88..028fbe6 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -185,6 +185,7 @@ if (kokkos_POPULATED) target_include_directories(miniWeather_kokkos PRIVATE "${PROJECT_SOURCE_DIR}") target_link_libraries(miniWeather_kokkos PRIVATE Kokkos::kokkos) + target_compile_definitions(miniWeather_kokkos PRIVATE MINIWEATHER_KOKKOS) # We got mdspan from Kokkos. target_compile_options(miniWeather_kokkos PRIVATE $<$,$,$>: diff --git a/cpp-mdspan/miniWeather_common.hpp b/cpp-mdspan/miniWeather_common.hpp index cde78cd..401016b 100644 --- a/cpp-mdspan/miniWeather_common.hpp +++ b/cpp-mdspan/miniWeather_common.hpp @@ -20,8 +20,6 @@ #include "mdspan/mdspan.hpp" #include "unique_mdarray.hpp" -#define MINIWEATHER_ONLY_OUTPUT_THETA 1 - constexpr double pi = 3.14159265358979323846264338327; //Pi constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) constexpr double cp = 1004.; //Specific heat of dry air at constant pressure @@ -305,213 +303,9 @@ r_t_pair hydro_const_bvfreq(double z, double bv_freq0); double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, double xrad, double zrad); -// Perform a single time step. -// Time steps are dimensionally split and -// use a simple low-storage three-stage Runge-Kutta time integrator. -// The dimensional splitting is a second-order-accurate alternating Strang splitting -// that alternates the order of directions each time step. 
-// -// The Runge-Kutta method used here is defined as follows: -// -// q* = q[n] + dt/3 * rhs(q[n]) -// q** = q[n] + dt/2 * rhs(q* ) -// q[n+1] = q[n] + dt/1 * rhs(q** ) -// -template -void perform_timestep( - ExecutionPolicy exec_policy, - view_3d state, view_3d state_tmp, - view_3d flux, view_3d tend, - const global_const_scalars& c_scalars, - const global_const_arrays& c_arrays, - global_scalars& scalars) -{ - const double dt = scalars.dt; - if (scalars.direction_switch) { - //x-direction first - semi_discrete_step(exec_policy, state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); - //z-direction second - semi_discrete_step(exec_policy, state, state , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays); - } else { - //z-direction second - semi_discrete_step(exec_policy, state, state , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays); - //x-direction first - semi_discrete_step(exec_policy, state, state , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays); - semi_discrete_step(exec_policy, state, state_tmp, state , dt / 1, direction::X, flux, tend, c_scalars, c_arrays); - } - if (scalars.direction_switch) { - scalars.direction_switch = 0; - } else { - scalars.direction_switch = 1; - } -} - -//Perform a single semi-discretized step in time with the form: -//state_out = state_init + dt * rhs(state_forcing) -//Meaning the step starts from state_init, computes the rhs using state_forcing, -//and stores the result in state_out -template -void semi_discrete_step( - ExecutionPolicy exec_policy, - view_3d_const state_init, - view_3d state_forcing, - view_3d state_out, - double dt /* not scalars.dt */, - direction dir, view_3d flux, view_3d tend, - const global_const_scalars& scalars, - const global_const_arrays& arrays) -{ - if (dir == direction::X) { - //Set the halo values for this MPI task's fluid state in the x-direction - set_halo_values_x(exec_policy, state_forcing, scalars, arrays); - //Compute the time tendencies for the fluid state in the x-direction - compute_tendencies_x(exec_policy, state_forcing, flux, tend, dt, scalars, arrays); - } else if (dir == direction::Z) { - //Set the halo values for this MPI task's fluid state in the z-direction - set_halo_values_z(exec_policy, state_forcing, scalars, arrays); - //Compute the time tendencies for the fluid state in the z-direction - compute_tendencies_z(exec_policy, state_forcing, flux, tend, dt, scalars, arrays); - } - - apply_tendencies_to_fluid_state(exec_policy, state_init, state_out, dt, tend, scalars, arrays); -} - -template -init_result init( - ExecutionPolicy exec_policy, - MemorySpace memory_space, - int *argc , char ***argv) -{ - (void) 
MPI_Init(argc,argv); - - ///////////////////////////////////////////////////////////// - // BEGIN MPI DUMMY SECTION - // TODO: (1) GET NUMBER OF MPI RANKS - // (2) GET MY MPI RANK ID (RANKS ARE ZERO-BASED INDEX) - // (3) COMPUTE MY BEGINNING "I" INDEX (1-based index) - // (4) COMPUTE HOW MANY X-DIRECTION CELLS MY RANK HAS - // (5) FIND MY LEFT AND RIGHT NEIGHBORING RANK IDs - ///////////////////////////////////////////////////////////// - int i_beg = 0; - int nx = nx_glob; - - ////////////////////////////////////////////// - // END MPI DUMMY SECTION - ////////////////////////////////////////////// - - //Vertical direction isn't MPI-ized, so the rank's local values = the global values - int k_beg = 0; - int nz = nz_glob; - int nranks = 1; - int myrank = 0; - int left_rank = 0; - int right_rank = 0; - bool mainproc = (myrank == 0); - - global_arrays gl_arrs(memory_space, nx, nz, hs); - auto state = gl_arrs.state(); - auto state_tmp = gl_arrs.state_tmp(); - auto flux = gl_arrs.flux(); - auto tend = gl_arrs.tend(); - - //Define the maximum stable time step based on an assumed maximum wind speed - double dt = fmin(dx,dz) / max_speed * cfl; - - //If I'm the main process in MPI, display some grid information - if (mainproc) { - fprintf(stderr, "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob); - fprintf(stderr, "dx,dz: %lf %lf\n",dx,dz); - fprintf(stderr, "dt: %lf\n",dt); - } - //Want to make sure this info is displayed before further output - (void) MPI_Barrier(MPI_COMM_WORLD); - - initialize_cell_averaged_fluid_state(exec_policy, - state, state_tmp, nx, nz, i_beg, k_beg); - - global_const_arrays gl_const_arrs(memory_space, nx, nz, hs); - // Get nonconst views, so we can fill them in below. - auto hy_dens_cell = gl_const_arrs.hy_dens_cell(); - auto hy_dens_theta_cell = gl_const_arrs.hy_dens_theta_cell(); - auto hy_dens_int = gl_const_arrs.hy_dens_int(); - auto hy_dens_theta_int = gl_const_arrs.hy_dens_theta_int(); - auto hy_pressure_int = gl_const_arrs.hy_pressure_int(); - - compute_hydrostatic_background_state(exec_policy, - hy_dens_cell, hy_dens_theta_cell, - hy_dens_int, hy_dens_theta_int, hy_pressure_int, nz, k_beg); - - return init_result{ - global_const_scalars{ -#if defined(__cpp_designated_initializers) - .nx = nx, - .nz = nz, - .i_beg = i_beg, - .k_beg = k_beg, - .nranks = nranks, - .myrank = myrank, - .left_rank = left_rank, - .right_rank = right_rank -#else - nx, - nz, - i_beg, - k_beg, - nranks, - myrank, - left_rank, - right_rank -#endif - }, - global_scalars{ -#if defined(__cpp_designated_initializers) - .dt = dt, - .etime = 0.0, - .output_counter = 0.0, - .num_out = 0, - .direction_switch = 1 -#else - dt, - /* etime = */ 0.0, - /* output_counter = */ 0.0, - /* num_out = */ 0, - /* direction_switch = */ 1 -#endif - }, - std::move(gl_const_arrs), - std::move(gl_arrs) - }; -} - struct reduction_result { double mass; double te; }; -//Compute reduced quantities for error checking without resorting to the "ncdiff" tool -template -reduction_result reductions( - ExecutionPolicy exec_policy, - view_3d_const state, - const global_const_scalars& const_scalars, - const global_const_arrays& const_arrays) -{ - reduction_result result = local_reductions(exec_policy, - state, const_scalars, const_arrays); - std::array loc{result.mass, result.te}; - std::array glob{0.0, 0.0}; - int ierr = MPI_Allreduce(loc.data(), glob.data(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - return reduction_result{ - .mass = glob[0], - .te = glob[1] - }; -} - void finalize(); diff --git 
a/cpp-mdspan/miniWeather_generic_algs.hpp b/cpp-mdspan/miniWeather_generic_algs.hpp
new file mode 100644
index 0000000..b6bfbe1
--- /dev/null
+++ b/cpp-mdspan/miniWeather_generic_algs.hpp
@@ -0,0 +1,210 @@
+#pragma once
+
+#include "miniWeather_common.hpp"
+
+// Perform a single time step.
+// Time steps are dimensionally split and
+// use a simple low-storage three-stage Runge-Kutta time integrator.
+// The dimensional splitting is a second-order-accurate alternating Strang splitting
+// that alternates the order of directions each time step.
+//
+// The Runge-Kutta method used here is defined as follows:
+//
+//   q*     = q[n] + dt/3 * rhs(q[n])
+//   q**    = q[n] + dt/2 * rhs(q*  )
+//   q[n+1] = q[n] + dt/1 * rhs(q** )
+//
+template <class ExecutionPolicy>
+void perform_timestep(
+  ExecutionPolicy exec_policy,
+  view_3d state, view_3d state_tmp,
+  view_3d flux, view_3d tend,
+  const global_const_scalars& c_scalars,
+  const global_const_arrays& c_arrays,
+  global_scalars& scalars)
+{
+  const double dt = scalars.dt;
+  if (scalars.direction_switch) {
+    //x-direction first
+    semi_discrete_step(exec_policy, state, state     , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state     , dt / 1, direction::X, flux, tend, c_scalars, c_arrays);
+    //z-direction second
+    semi_discrete_step(exec_policy, state, state     , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state     , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays);
+  } else {
+    //z-direction first
+    semi_discrete_step(exec_policy, state, state     , state_tmp, dt / 3, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::Z, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state     , dt / 1, direction::Z, flux, tend, c_scalars, c_arrays);
+    //x-direction second
+    semi_discrete_step(exec_policy, state, state     , state_tmp, dt / 3, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state_tmp, dt / 2, direction::X, flux, tend, c_scalars, c_arrays);
+    semi_discrete_step(exec_policy, state, state_tmp, state     , dt / 1, direction::X, flux, tend, c_scalars, c_arrays);
+  }
+  if (scalars.direction_switch) {
+    scalars.direction_switch = 0;
+  } else {
+    scalars.direction_switch = 1;
+  }
+}
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing,
+//and stores the result in state_out
+template <class ExecutionPolicy>
+void semi_discrete_step(
+  ExecutionPolicy exec_policy,
+  view_3d_const state_init,
+  view_3d state_forcing,
+  view_3d state_out,
+  double dt /* not scalars.dt */,
+  direction dir, view_3d flux, view_3d tend,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  if (dir == direction::X) {
+    //Set the halo values for this MPI task's fluid state in the x-direction
+    set_halo_values_x(exec_policy, state_forcing, scalars, arrays);
+    //Compute the time tendencies for the fluid state in the x-direction
+    compute_tendencies_x(exec_policy, state_forcing, flux, tend, dt, scalars, arrays);
+  } else if (dir == direction::Z) 
{
+    //Set the halo values for this MPI task's fluid state in the z-direction
+    set_halo_values_z(exec_policy, state_forcing, scalars, arrays);
+    //Compute the time tendencies for the fluid state in the z-direction
+    compute_tendencies_z(exec_policy, state_forcing, flux, tend, dt, scalars, arrays);
+  }
+
+  apply_tendencies_to_fluid_state(exec_policy, state_init, state_out, dt, tend, scalars, arrays);
+}
+
+template <class ExecutionPolicy, class MemorySpace>
+init_result init(
+  ExecutionPolicy exec_policy,
+  MemorySpace memory_space,
+  int *argc , char ***argv)
+{
+  (void) MPI_Init(argc,argv);
+
+  /////////////////////////////////////////////////////////////
+  // TODO: (1) GET NUMBER OF MPI RANKS
+  //       (2) GET MY MPI RANK ID (RANKS ARE ZERO-BASED INDEX)
+  //       (3) COMPUTE MY BEGINNING "I" INDEX (1-based index)
+  //       (4) COMPUTE HOW MANY X-DIRECTION CELLS MY RANK HAS
+  //       (5) FIND MY LEFT AND RIGHT NEIGHBORING RANK IDs
+  /////////////////////////////////////////////////////////////
+  int i_beg = 0;
+  int nx = nx_glob;
+
+  //////////////////////////////////////////////
+  // END MPI DUMMY SECTION
+  //////////////////////////////////////////////
+
+  //Vertical direction isn't MPI-ized, so the rank's local values = the global values
+  int k_beg = 0;
+  int nz = nz_glob;
+  int nranks = 1;
+  int myrank = 0;
+  int left_rank = 0;
+  int right_rank = 0;
+  bool mainproc = (myrank == 0);
+
+  global_arrays gl_arrs(memory_space, nx, nz, hs);
+  auto state     = gl_arrs.state();
+  auto state_tmp = gl_arrs.state_tmp();
+  auto flux      = gl_arrs.flux();
+  auto tend      = gl_arrs.tend();
+
+  //Define the maximum stable time step based on an assumed maximum wind speed
+  double dt = fmin(dx,dz) / max_speed * cfl;
+
+  //If I'm the main process in MPI, display some grid information
+  if (mainproc) {
+    fprintf(stderr, "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+    fprintf(stderr, "dx,dz: %lf %lf\n",dx,dz);
+    fprintf(stderr, "dt: %lf\n",dt);
+  }
+  //Want to make sure this info is displayed before further output
+  (void) MPI_Barrier(MPI_COMM_WORLD);
+
+  initialize_cell_averaged_fluid_state(exec_policy,
+    state, state_tmp, nx, nz, i_beg, k_beg);
+
+  global_const_arrays gl_const_arrs(memory_space, nx, nz, hs);
+  // Get nonconst views, so we can fill them in below.
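+  // (A sketch of the intended pattern, assuming the accessors shown below
+  // return writable views into gl_const_arrs' storage:
+  //   auto v = gl_const_arrs.hy_dens_cell();
+  //   v[0] = 1.0;  // fill once, during init
+  // After init() returns, the same data is read only through
+  // global_const_arrays' const interface.)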
+  auto hy_dens_cell       = gl_const_arrs.hy_dens_cell();
+  auto hy_dens_theta_cell = gl_const_arrs.hy_dens_theta_cell();
+  auto hy_dens_int        = gl_const_arrs.hy_dens_int();
+  auto hy_dens_theta_int  = gl_const_arrs.hy_dens_theta_int();
+  auto hy_pressure_int    = gl_const_arrs.hy_pressure_int();
+
+  compute_hydrostatic_background_state(exec_policy,
+    hy_dens_cell, hy_dens_theta_cell,
+    hy_dens_int, hy_dens_theta_int, hy_pressure_int, nz, k_beg);
+
+  return init_result{
+    global_const_scalars{
+#if defined(__cpp_designated_initializers)
+      .nx = nx,
+      .nz = nz,
+      .i_beg = i_beg,
+      .k_beg = k_beg,
+      .nranks = nranks,
+      .myrank = myrank,
+      .left_rank = left_rank,
+      .right_rank = right_rank
+#else
+      nx,
+      nz,
+      i_beg,
+      k_beg,
+      nranks,
+      myrank,
+      left_rank,
+      right_rank
+#endif
+    },
+    global_scalars{
+#if defined(__cpp_designated_initializers)
+      .dt = dt,
+      .etime = 0.0,
+      .output_counter = 0.0,
+      .num_out = 0,
+      .direction_switch = 1
+#else
+      dt,
+      /* etime = */ 0.0,
+      /* output_counter = */ 0.0,
+      /* num_out = */ 0,
+      /* direction_switch = */ 1
+#endif
+    },
+    std::move(gl_const_arrs),
+    std::move(gl_arrs)
+  };
+}
+
+//Compute reduced quantities for error checking without resorting to the "ncdiff" tool
+template <class ExecutionPolicy>
+reduction_result reductions(
+  ExecutionPolicy exec_policy,
+  view_3d_const state,
+  const global_const_scalars& const_scalars,
+  const global_const_arrays& const_arrays)
+{
+  reduction_result result = local_reductions(exec_policy,
+    state, const_scalars, const_arrays);
+  std::array<double, 2> loc{result.mass, result.te};
+  std::array<double, 2> glob{0.0, 0.0};
+  int ierr = MPI_Allreduce(loc.data(), glob.data(), 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+  return reduction_result{
+    .mass = glob[0],
+    .te = glob[1]
+  };
+}
+
+
+
diff --git a/cpp-mdspan/miniWeather_kokkos.hpp b/cpp-mdspan/miniWeather_kokkos.hpp
new file mode 100644
index 0000000..ae29b3b
--- /dev/null
+++ b/cpp-mdspan/miniWeather_kokkos.hpp
@@ -0,0 +1,390 @@
+#pragma once
+
+#include "miniWeather_common.hpp"
+#include <Kokkos_Core.hpp>
+
+using kokkos_execution_policy = Kokkos::DefaultExecutionSpace;
+
+template <int N>
+using md_range_policy = Kokkos::MDRangePolicy<
+  kokkos_execution_policy,
+  Kokkos::Rank<N>,
+  Kokkos::IndexType<int>>;
+
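+// (Usage sketch for the alias above -- an illustration, not part of the
+// port itself: iterate over an nz x nx index box on the chosen execution
+// space.
+//   Kokkos::parallel_for("sketch",
+//     md_range_policy<2>(kokkos_execution_policy{}, {0, 0}, {nz, nx}),
+//     KOKKOS_LAMBDA(int k, int i) { /* ... */ });
+// Kokkos::IndexType<int> makes the lambda take plain int indices.)
+
+//Set this MPI task's halo values in the x-direction.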
+template +void set_halo_values_x( + kokkos_execution_policy exec_policy, + view_3d state, + const global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + + //////////////////////////////////////////////////////////////////////// + // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS + // (1) give state(1:hs,1:nz,1:NUM_VARS) to my left neighbor + // (2) receive state(1-hs:0,1:nz,1:NUM_VARS) from my left neighbor + // (3) give state(nx-hs+1:nx,1:nz,1:NUM_VARS) to my right neighbor + // (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor + //////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////// + // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI + ////////////////////////////////////////////////////// + + Kokkos::parallel_for( + "set_halo_values_x", + md_range_policy<2>(exec_policy, {0, 0}, {NUM_VARS, nz}), + KOKKOS_LAMBDA(int ll, int k) { + state(ll, k+hs, 0) = state(ll, k+hs, nx+hs-2); + state(ll, k+hs, 1) = state(ll, k+hs, nx+hs-1); + state(ll, k+hs, nx+hs) = state(ll, k+hs, hs); + state(ll, k+hs, nx+hs+1) = state(ll, k+hs, hs+1); + }); + //////////////////////////////////////////////////// + + if (data_spec_int == DATA_SPEC_INJECTION) { + if (scalars.myrank == 0) { + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + const int k_beg = scalars.k_beg; + + Kokkos::parallel_for( + "set_halo_values_x(INJECTION)", + md_range_policy<2>(exec_policy, {0, 0}, {nz, hs}), + KOKKOS_LAMBDA(int k, int i) { + const double z = (k_beg + k+0.5)*dz; + if (fabs(z-3*zlen/4) <= zlen/16) { + state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 50.0; + state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 298.0 - + hy_dens_theta_cell[k+hs]; + } + }); + } + } +} + +//Set this MPI task's halo values in the z-direction. 
This does not require MPI because there is no MPI
+//decomposition in the vertical direction
+template
+void set_halo_values_z(
+  kokkos_execution_policy exec_policy,
+  view_3d state,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  auto hy_dens_cell = arrays.hy_dens_cell();
+
+  Kokkos::parallel_for(
+    "set_halo_values_z",
+    md_range_policy<2>(exec_policy, {0, 0}, {NUM_VARS, nx + 2*hs}),
+    KOKKOS_LAMBDA(int ll, int i) {
+      if (ll == ID_WMOM) {
+        state(ll, 0, i) = 0.0;
+        state(ll, 1, i) = 0.0;
+        state(ll, nz+hs, i) = 0.0;
+        state(ll, nz+hs+1, i) = 0.0;
+      } else if (ll == ID_UMOM) {
+        state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0];
+        state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1];
+        state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs];
+        state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1];
+      } else {
+        state(ll, 0, i) = state(ll, hs, i);
+        state(ll, 1, i) = state(ll, hs, i);
+        state(ll, nz+hs, i) = state(ll, nz+hs-1, i);
+        state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i);
+      }
+    });
+}
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+//Since the halos are set in a separate routine, this will not require MPI
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+template
+void compute_tendencies_x(
+  kokkos_execution_policy exec_policy,
+  view_3d_const state,
+  view_3d flux, view_3d tend, double dt,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  // Hyperviscosity coefficient
+  const double hv_coef = -hv_beta * dx / (16*dt);
+  auto hy_dens_cell       = arrays.hy_dens_cell();
+  auto hy_dens_theta_cell = arrays.hy_dens_theta_cell();
+
+  //Compute fluxes in the x-direction for each cell
+  Kokkos::parallel_for(
+    "compute_tendencies_x(1)",
+    md_range_policy<2>(exec_policy, {0, 0}, {nz, nx+1}),
+    KOKKOS_LAMBDA(int k, int i) {
+      //Use fourth-order interpolation from four cell averages
+      //to compute the value at the interface in question
+      std::array<double, NUM_VARS> d3_vals;
+      std::array<double, NUM_VARS> vals;
+      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        std::array<double, sten_size> stencil;
+        for (int s = 0; s < sten_size; ++s) {
+          stencil[s] = state(ll, k+hs, i+s);
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12;
+        //First-order-accurate interpolation of the third spatial derivative
+        //of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature,
+      //and pressure (r,u,w,t,p respectively)
+      double r = vals[ID_DENS] + hy_dens_cell[k+hs];
+      double u = vals[ID_UMOM] / r;
+      double w = vals[ID_WMOM] / r;
+      double t = ( vals[ID_RHOT] + hy_dens_theta_cell[k+hs] ) / r;
+      double p = C0 * pow(r*t, gamm);
+
+      //Compute the flux vector
+      flux(ID_DENS, k, i) = r*u     - hv_coef*d3_vals[ID_DENS];
+      flux(ID_UMOM, k, i) = r*u*u+p - hv_coef*d3_vals[ID_UMOM];
+      flux(ID_WMOM, k, i) = r*u*w   - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*u*t   - hv_coef*d3_vals[ID_RHOT];
+    });
+
+  //Use the fluxes to compute tendencies for each cell
+  {
+    view_3d_const flux_c = flux;
+    Kokkos::parallel_for(
+      "compute_tendencies_x(2)",
+      md_range_policy<3>(exec_policy, {0, 0, 0}, {NUM_VARS, nz, nx}),
+      KOKKOS_LAMBDA(int ll, int k, int i) {
+        tend(ll, k, i) = -( flux_c(ll, k, i+1) - flux_c(ll, k, i) ) / dx;
+      });
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//Since the halos are set in a separate routine, this will not require MPI
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+template
+void compute_tendencies_z(
+  kokkos_execution_policy exec_policy,
+  view_3d_const state,
+  view_3d flux, view_3d tend, double dt,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  // Hyperviscosity coefficient
+  const double hv_coef = -hv_beta * dz / (16*dt);
+  auto hy_dens_int       = arrays.hy_dens_int();
+  auto hy_dens_theta_int = arrays.hy_dens_theta_int();
+  auto hy_pressure_int   = arrays.hy_pressure_int();
+
+  //Compute fluxes in the z-direction for each cell
+  Kokkos::parallel_for(
+    "compute_tendencies_z(1)",
+    md_range_policy<2>(exec_policy, {0, 0}, {nz + 1, nx}),
+    KOKKOS_LAMBDA(int k, int i) {
+      //Use fourth-order interpolation from four cell averages
+      //to compute the value at the interface in question
+      std::array<double, NUM_VARS> d3_vals;
+      std::array<double, NUM_VARS> vals;
+      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        std::array<double, sten_size> stencil;
+        for (int s = 0; s < sten_size; ++s) {
+          stencil[s] = state(ll, k+s, i+hs);
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12;
+        //First-order-accurate interpolation of the third spatial derivative
+        //of the state
+        d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature,
+      //and pressure (r,u,w,t,p respectively)
+      double r = vals[ID_DENS] + hy_dens_int[k];
+      double u = vals[ID_UMOM] / r;
+      double w = vals[ID_WMOM] / r;
+      double t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      double p = C0 * pow(r * t, gamm) - hy_pressure_int[k];
+      //Enforce vertical boundary condition and exact mass conservation
+      if (k == 0 || k == nz) {
+        w = 0;
+        d3_vals[ID_DENS] = 0;
+      }
+
+      //Compute the flux vector with hyperviscosity
+      flux(ID_DENS, k, i) = r*w     - hv_coef*d3_vals[ID_DENS];
+      flux(ID_UMOM, k, i) = r*w*u   - hv_coef*d3_vals[ID_UMOM];
+      flux(ID_WMOM, k, i) = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*w*t   - hv_coef*d3_vals[ID_RHOT];
+    });
+
+  //Use the fluxes to compute tendencies for each cell
+  {
+    view_3d_const flux_c = flux;
+    Kokkos::parallel_for(
+      "compute_tendencies_z(2)",
+      md_range_policy<3>(exec_policy, {0, 0, 0}, {NUM_VARS, nz, nx}),
+      KOKKOS_LAMBDA(int ll, int k, int i) {
+        tend(ll, k, i) = -( flux_c(ll, k+1, i) - flux_c(ll, k, i) ) / dz;
+        if (ll == ID_WMOM) {
+          tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav;
+        }
+      });
+  }
+}
+
+template
+void apply_tendencies_to_fluid_state(
+  kokkos_execution_policy exec_policy,
+  view_3d_const state_init,
+  view_3d state_out,
+  double dt /* not scalars.dt */,
+  view_3d tend,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  auto hy_dens_cell = arrays.hy_dens_cell();
+  view_3d_const tend_c = tend;
+
+  Kokkos::parallel_for(
+    "apply_tendencies_to_fluid_state",
+    md_range_policy<3>(exec_policy, {0, 0, 0}, {NUM_VARS, nz, nx}),
+    KOKKOS_LAMBDA(int ll, int k, int i) {
+      if 
(data_spec_int == DATA_SPEC_GRAVITY_WAVES) { + const int i_beg = scalars.i_beg; + const int k_beg = scalars.k_beg; + const double x = (i_beg + i+0.5)*dx; + const double z = (k_beg + k+0.5)*dz; + const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0); + tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k]; + } + state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend_c(ll, k, i); + }); +} + +// Initialize the cell-averaged fluid state via Gauss-Legendre quadrature +void initialize_cell_averaged_fluid_state( + kokkos_execution_policy exec_policy, + view_3d state, view_3d state_tmp, + int nx, int nz, + int i_beg, int k_beg) +{ + Kokkos::parallel_for( + "initialize_cell_averaged_fluid_state", + md_range_policy<2>(exec_policy, {0, 0}, {nz + 2*hs, nx + 2*hs}), + KOKKOS_LAMBDA(int k, int i) { + //Initialize the state to zero + for (int ll = 0; ll < NUM_VARS; ++ll) { + state(ll, k, i) = 0.0; + } + //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation + for (int kk = 0; kk < nqpoints; ++kk) { + for (int ii = 0; ii < nqpoints; ++ii) { + //Compute the x,z location within the global domain based on cell and quadrature index + const double x = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx; + const double z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz; + + //Set the fluid state based on the user's specification + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, x, z); + + //Store into the fluid state array + state(ID_DENS, k, i) = state(ID_DENS, k, i) + r * qweights[ii]*qweights[kk]; + state(ID_UMOM, k, i) = state(ID_UMOM, k, i) + (r+hr)*u * qweights[ii]*qweights[kk]; + state(ID_WMOM, k, i) = state(ID_WMOM, k, i) + (r+hr)*w * qweights[ii]*qweights[kk]; + state(ID_RHOT, k, i) = state(ID_RHOT, k, i) + ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]; + } + } + for (int ll = 0; ll < NUM_VARS; ++ll) { + state_tmp(ll, k, i) = state(ll, k, i); + } + }); +} + +void compute_hydrostatic_background_state( + kokkos_execution_policy exec_policy, + view_1d hy_dens_cell, + view_1d hy_dens_theta_cell, + view_1d hy_dens_int, + view_1d hy_dens_theta_int, + view_1d hy_pressure_int, + int nz, + int k_beg) +{ + using range_1d_type = + Kokkos::RangePolicy>; + + //Compute the hydrostatic background state over vertical cell averages + Kokkos::parallel_for( + "compute_hydrostatic_background_state(1)", + range_1d_type(exec_policy, 0, nz + 2*hs), + KOKKOS_LAMBDA(int k) { + hy_dens_cell[k] = 0.0; + hy_dens_theta_cell[k] = 0.0; + for (int kk = 0; kk < nqpoints; ++kk) { + const double z = (k_beg + k-hs+0.5)*dz; + //Set the fluid state based on the user's specification + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); + hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk]; + hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk]; + } + }); + + //Compute the hydrostatic background state at vertical cell interfaces + Kokkos::parallel_for( + "compute_hydrostatic_background_state(2)", + range_1d_type(exec_policy, 0, nz + 1), + KOKKOS_LAMBDA(int k) { + const double z = (k_beg + k)*dz; + auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z); + hy_dens_int[k] = hr; + hy_dens_theta_int[k] = hr * ht; + hy_pressure_int[k] = C0 * pow(hr * ht, gamm); + }); +} + +template +reduction_result local_reductions( + kokkos_execution_policy exec_policy, + view_3d_const state, + const global_const_scalars& const_scalars, + const global_const_arrays& const_arrays) +{ + reduction_result result{0.0, 0.0}; + const int nx = 
const_scalars.nx; + const int nz = const_scalars.nz; + auto hy_dens_cell = const_arrays.hy_dens_cell(); + auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); + + Kokkos::parallel_reduce( + "local_reductions", + md_range_policy<2>(exec_policy, {0, 0}, {nz, nx}), + KOKKOS_LAMBDA(int k, int i, reduction_result& thread_local_result) { + double r = state(ID_DENS, k+hs, i+hs) + hy_dens_cell[hs+k]; // Density + double u = state(ID_UMOM, k+hs, i+hs) / r; // U-wind + double w = state(ID_WMOM, k+hs, i+hs) / r; // W-wind + double th = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta) + double p = C0 * pow(r * th, gamm); // Pressure + double t = th / pow(p0 / p, rd / cp); // Temperature + double ke = r*(u*u+w*w); // Kinetic Energy + double ie = r*cv*t; // Internal Energy + thread_local_result.mass += r *dx*dz; // Accumulate domain mass + thread_local_result.te += (ke + ie)*dx*dz; // Accumulate domain total energy + }, result); + + return result; +} + diff --git a/cpp-mdspan/miniWeather_mdspan.cpp b/cpp-mdspan/miniWeather_mdspan.cpp index c06114b..d105e0f 100644 --- a/cpp-mdspan/miniWeather_mdspan.cpp +++ b/cpp-mdspan/miniWeather_mdspan.cpp @@ -8,14 +8,26 @@ #include "miniWeather_common.hpp" #include "miniWeather_output.hpp" -#include "miniWeather_serial.hpp" + +#if defined(MINIWEATHER_KOKKOS) +# include "miniWeather_kokkos.hpp" +#else +# include "miniWeather_serial.hpp" +#endif + +// This needs to go after the above (execution policy - specific) headers. +#include "miniWeather_generic_algs.hpp" auto default_memory_space() { return host_memory_space{}; } auto default_execution_policy() { +#if defined(MINIWEATHER_KOKKOS) + return kokkos_execution_policy{}; +#else return host_serial_execution_policy{}; +#endif } // Intra-(MPI-process) parallelization needs to happen in the following functions. diff --git a/cpp-mdspan/miniWeather_output.hpp b/cpp-mdspan/miniWeather_output.hpp index 0fcb136..7af72b7 100644 --- a/cpp-mdspan/miniWeather_output.hpp +++ b/cpp-mdspan/miniWeather_output.hpp @@ -3,6 +3,8 @@ #include "miniWeather_common.hpp" #include "pnetcdf.h" +#define MINIWEATHER_ONLY_OUTPUT_THETA 1 + //Error reporting routine for the PNetCDF I/O inline void ncwrap( int ierr , int line ) { if (ierr != NC_NOERR) { @@ -15,9 +17,9 @@ inline void ncwrap( int ierr , int line ) { //Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime) //The file I/O uses parallel-netcdf, the only external library required for this mini-app. 
//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics -template +template void output( - host_serial_execution_policy exec_policy, + ExecutionPolicy exec_policy, // make this generic for now view_3d_const state, const global_const_scalars& const_scalars, const global_const_arrays& const_arrays, From 4825faef97808f4b8b0c6dd2f65fe44affa5b3f3 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Mon, 7 Apr 2025 20:10:25 +0300 Subject: [PATCH 78/83] Enable Kokkos OpenACC build --- cpp-mdspan/CMakeLists.txt | 36 +++++- cpp-mdspan/miniWeather_common.cpp | 99 +-------------- cpp-mdspan/miniWeather_common.hpp | 155 +++++++++++++++++++----- cpp-mdspan/miniWeather_generic_algs.hpp | 10 +- cpp-mdspan/miniWeather_kokkos.hpp | 68 ++++++++++- 5 files changed, 228 insertions(+), 140 deletions(-) diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 028fbe6..0314c94 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -106,6 +106,9 @@ endif() # Users can set FETCHCONTENT_SOURCE_DIR_KOKKOS # to a local path to Kokkos, to override automatic downloading. +set(Kokkos_ENABLE_SERIAL ON CACHE INTERNAL "") +set(Kokkos_ENABLE_OPENACC ON CACHE INTERNAL "") + FetchContent_Declare( Kokkos GIT_REPOSITORY https://github.com/kokkos/kokkos.git @@ -181,13 +184,36 @@ target_compile_options(miniWeather_serial PRIVATE ) if (kokkos_POPULATED) - add_executable(miniWeather_kokkos miniWeather_mdspan.cpp miniWeather_common.cpp) - target_include_directories(miniWeather_kokkos PRIVATE "${PROJECT_SOURCE_DIR}") + add_executable(miniWeather_kokkos_serial miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_kokkos_serial PRIVATE "${PROJECT_SOURCE_DIR}") + + target_link_libraries(miniWeather_kokkos_serial PRIVATE Kokkos::kokkos) + target_compile_definitions(miniWeather_kokkos_serial PRIVATE MINIWEATHER_KOKKOS) + target_compile_definitions(miniWeather_kokkos_serial PRIVATE MINIWEATHER_KOKKOS_SERIAL) + + # We got mdspan from Kokkos. + target_compile_options(miniWeather_kokkos_serial PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> + ) +endif() + +# FIXME (mfh 2025/04/04) Figure out how to pass OpenACC flags to Kokkos. +if (kokkos_POPULATED AND OpenACC_FOUND) + set(Kokkos_ENABLE_OPENACC ON) + + add_executable(miniWeather_kokkos_openacc miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_kokkos_openacc PRIVATE "${PROJECT_SOURCE_DIR}") + + target_link_libraries(miniWeather_kokkos_openacc PRIVATE Kokkos::kokkos) + target_compile_definitions(miniWeather_kokkos_openacc PRIVATE MINIWEATHER_KOKKOS) + target_compile_definitions(miniWeather_kokkos_openacc PRIVATE MINIWEATHER_KOKKOS_OPENACC) - target_link_libraries(miniWeather_kokkos PRIVATE Kokkos::kokkos) - target_compile_definitions(miniWeather_kokkos PRIVATE MINIWEATHER_KOKKOS) # We got mdspan from Kokkos. 
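+  # (Aside: a typical configure invocation for this target -- an assumed
+  # example, not part of the build scripts:
+  #   cmake -DKokkos_ENABLE_OPENACC=ON -DCMAKE_CXX_COMPILER=nvc++ ..
+  # nvc++ is NVHPC's compiler; any compiler that Kokkos' OpenACC backend
+  # supports should work here.)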
- target_compile_options(miniWeather_kokkos PRIVATE + target_link_libraries(miniWeather_kokkos_openacc PRIVATE OpenACC::OpenACC_CXX) + target_compile_options(miniWeather_kokkos_openacc PRIVATE $<$,$,$>: -Wall> $<$: diff --git a/cpp-mdspan/miniWeather_common.cpp b/cpp-mdspan/miniWeather_common.cpp index ae56336..935ad5a 100644 --- a/cpp-mdspan/miniWeather_common.cpp +++ b/cpp-mdspan/miniWeather_common.cpp @@ -10,102 +10,9 @@ make_unique_array_1d(host_memory_space, int X) { return std::make_unique(X); } -test_case injection(double x , double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = 0.0; - double u = 0.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - -test_case density_current(double x , double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0); - double u = 0.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - -test_case gravity_waves(double x, double z) { - auto [hr, ht] = hydro_const_bvfreq(z, 0.02); - double r = 0.0; - double t = 0.0; - double u = 15.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - -test_case thermal(double x, double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = sample_ellipse_cosine(x, z, 3.0, xlen/2,2000.0, 2000.0, 2000.0); - double u = 0.0; - double w = 0.0; - return {r, u, w, t, hr, ht}; -} - -test_case collision(double x , double z) { - auto [hr, ht] = hydro_const_theta(z); - double r = 0.0; - double t = 0.0; - double u = 0.0; - double w = 0.0; - t = t + sample_ellipse_cosine(x, z, 20.0, xlen/2,2000.0, 2000.0, 2000.0); - t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2,8000.0, 2000.0, 2000.0); - return {r, u, w, t, hr, ht}; -} - -test_case get_test_case(int data_spec, double x_, double z_) { - if (data_spec == DATA_SPEC_COLLISION ) { return collision(x_, z_); } - if (data_spec == DATA_SPEC_THERMAL ) { return thermal(x_, z_); } - if (data_spec == DATA_SPEC_GRAVITY_WAVES ) { return gravity_waves(x_, z_); } - if (data_spec == DATA_SPEC_DENSITY_CURRENT) { return density_current(x_, z_); } - if (data_spec == DATA_SPEC_INJECTION ) { return injection(x_, z_); } - assert(false); - return test_case{}; -} - -r_t_pair hydro_const_theta(double z) { - const double theta0 = 300.; //Background potential temperature - const double exner0 = 1.; //Surface-level Exner pressure - double p,exner,rt; - //Establish hydrostatic balance first using Exner pressure - double t = theta0; //Potential Temperature at z - exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z - p = p0 * pow(exner,(cp/rd)); //Pressure at z - rt = pow((p / C0),(1. / gamm)); //rho*theta at z - double r = rt / t; //Density at z - - return {r, t}; -} - -r_t_pair hydro_const_bvfreq(double z, double bv_freq0) { - const double theta0 = 300.; //Background potential temperature - const double exner0 = 1.; //Surface-level Exner pressure - double p, exner, rt; - double t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ); //Pot temp at z - exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z - p = p0 * pow(exner,(cp/rd)); //Pressure at z - rt = pow((p / C0), (1. 
/ gamm)); //rho*theta at z - double r = rt / t; //Density at z - - return {r, t}; -} - -double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ) { - double dist; - //Compute distance from bubble center - dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0; - //If the distance from bubble center is less than the radius, create a cos**2 profile - if (dist <= pi / 2.0) { - return amp * pow(cos(dist), 2.0); - } else { - return 0.; - } -} - void finalize() { +#if defined(MINIWEATHER_KOKKOS) + Kokkos::finalize(); +#endif (void) MPI_Finalize(); } diff --git a/cpp-mdspan/miniWeather_common.hpp b/cpp-mdspan/miniWeather_common.hpp index 401016b..f26a8e8 100644 --- a/cpp-mdspan/miniWeather_common.hpp +++ b/cpp-mdspan/miniWeather_common.hpp @@ -20,6 +20,13 @@ #include "mdspan/mdspan.hpp" #include "unique_mdarray.hpp" +#if defined(MINIWEATHER_KOKKOS) +# include "Kokkos_Core.hpp" +# define MINIWEATHER_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION +#else +# define MINIWEATHER_INLINE_FUNCTION inline +#endif + constexpr double pi = 3.14159265358979323846264338327; //Pi constexpr double grav = 9.8; //Gravitational acceleration (m / s^2) constexpr double cp = 1004.; //Specific heat of dry air at constant pressure @@ -251,6 +258,64 @@ struct init_result { global_arrays arrays; }; +struct r_t_pair { + double r; + double t; +}; + +// Establish hydrostatic balance using constant potential temperature +// (thermally neutral atmosphere) +// z is the input coordinate +// r and t are the output background hydrostatic density and potential temperature +MINIWEATHER_INLINE_FUNCTION +r_t_pair hydro_const_theta(double z) { + const double theta0 = 300.; //Background potential temperature + const double exner0 = 1.; //Surface-level Exner pressure + double p,exner,rt; + //Establish hydrostatic balance first using Exner pressure + double t = theta0; //Potential Temperature at z + exner = exner0 - grav * z / (cp * theta0); //Exner pressure at z + p = p0 * pow(exner,(cp/rd)); //Pressure at z + rt = pow((p / C0),(1. / gamm)); //rho*theta at z + double r = rt / t; //Density at z + + return {r, t}; +} + +//Establish hydrostatic balance using constant Brunt-Vaisala frequency +//z is the input coordinate +//bv_freq0 is the constant Brunt-Vaisala frequency +//r and t are the output background hydrostatic density and potential temperature +MINIWEATHER_INLINE_FUNCTION +r_t_pair hydro_const_bvfreq(double z, double bv_freq0) { + const double theta0 = 300.; //Background potential temperature + const double exner0 = 1.; //Surface-level Exner pressure + double p, exner, rt; + double t = theta0 * exp( bv_freq0*bv_freq0 / grav * z ); //Pot temp at z + exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z + p = p0 * pow(exner,(cp/rd)); //Pressure at z + rt = pow((p / C0), (1. 
/ gamm)); //rho*theta at z + double r = rt / t; //Density at z + + return {r, t}; +} + +//Sample from an ellipse of a specified center, radius, and amplitude at a specified location +//x and z are input coordinates +//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse +MINIWEATHER_INLINE_FUNCTION +double sample_ellipse_cosine( double x , double z , double amp , double x0 , double z0 , double xrad , double zrad ) { + double dist; + //Compute distance from bubble center + dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0; + //If the distance from bubble center is less than the radius, create a cos**2 profile + if (dist <= pi / 2.0) { + return amp * pow(cos(dist), 2.0); + } else { + return 0.; + } +} + struct test_case { double r; double u; @@ -265,43 +330,71 @@ struct test_case { // hr and ht are output background hydrostatic density and potential temperature at that location. //This test case is initially balanced but injects fast, cold air from the left boundary near the model top -test_case injection(double x, double z); +MINIWEATHER_INLINE_FUNCTION +test_case injection(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = 0.0; + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} //Initialize a density current (falling cold thermal that propagates along the model bottom) -test_case density_current(double x, double z); - -test_case gravity_waves(double x, double z); +MINIWEATHER_INLINE_FUNCTION +test_case density_current(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0); + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} + +MINIWEATHER_INLINE_FUNCTION +test_case gravity_waves(double x, double z) { + auto [hr, ht] = hydro_const_bvfreq(z, 0.02); + double r = 0.0; + double t = 0.0; + double u = 15.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} //Rising thermal -test_case thermal(double x, double z); +MINIWEATHER_INLINE_FUNCTION +test_case thermal(double x, double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = sample_ellipse_cosine(x, z, 3.0, xlen/2,2000.0, 2000.0, 2000.0); + double u = 0.0; + double w = 0.0; + return {r, u, w, t, hr, ht}; +} //Colliding thermals -test_case collision(double x, double z); - -test_case get_test_case(int data_spec, double x_, double z_); - -struct r_t_pair { - double r; - double t; -}; - -// Establish hydrostatic balance using constant potential temperature -// (thermally neutral atmosphere) -// z is the input coordinate -// r and t are the output background hydrostatic density and potential temperature -r_t_pair hydro_const_theta(double z); - -//Establish hydrostatic balance using constant Brunt-Vaisala frequency -//z is the input coordinate -//bv_freq0 is the constant Brunt-Vaisala frequency -//r and t are the output background hydrostatic density and potential temperature -r_t_pair hydro_const_bvfreq(double z, double bv_freq0); - -//Sample from an ellipse of a specified center, radius, and amplitude at a specified location -//x and z are input coordinates -//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse -double sample_ellipse_cosine(double x, double z, double amp, double x0, double z0, - double xrad, double zrad); +MINIWEATHER_INLINE_FUNCTION +test_case collision(double x , double z) { + auto [hr, ht] = hydro_const_theta(z); + double r = 0.0; + double t = 
0.0; + double u = 0.0; + double w = 0.0; + t = t + sample_ellipse_cosine(x, z, 20.0, xlen/2,2000.0, 2000.0, 2000.0); + t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2,8000.0, 2000.0, 2000.0); + return {r, u, w, t, hr, ht}; +} + +MINIWEATHER_INLINE_FUNCTION +test_case get_test_case(int data_spec, double x, double z) { + if (data_spec == DATA_SPEC_COLLISION ) { return collision(x, z); } + if (data_spec == DATA_SPEC_THERMAL ) { return thermal(x, z); } + if (data_spec == DATA_SPEC_GRAVITY_WAVES ) { return gravity_waves(x, z); } + if (data_spec == DATA_SPEC_DENSITY_CURRENT) { return density_current(x, z); } + if (data_spec == DATA_SPEC_INJECTION ) { return injection(x, z); } + assert(false); + return test_case{}; +} struct reduction_result { double mass; diff --git a/cpp-mdspan/miniWeather_generic_algs.hpp b/cpp-mdspan/miniWeather_generic_algs.hpp index b6bfbe1..b31b80e 100644 --- a/cpp-mdspan/miniWeather_generic_algs.hpp +++ b/cpp-mdspan/miniWeather_generic_algs.hpp @@ -1,6 +1,9 @@ #pragma once #include "miniWeather_common.hpp" +#if defined(MINIWEATHER_KOKKOS) +# include "Kokkos_Core.hpp" +#endif // Perform a single time step. // Time steps are dimensionally split and @@ -87,7 +90,9 @@ init_result init( int *argc , char ***argv) { (void) MPI_Init(argc,argv); - +#if defined(MINIWEATHER_KOKKOS) + Kokkos::initialize(*argc, *argv); +#endif ///////////////////////////////////////////////////////////// // BEGIN MPI DUMMY SECTION // TODO: (1) GET NUMBER OF MPI RANKS @@ -205,6 +210,3 @@ reduction_result reductions( .te = glob[1] }; } - - - diff --git a/cpp-mdspan/miniWeather_kokkos.hpp b/cpp-mdspan/miniWeather_kokkos.hpp index ae29b3b..dc140eb 100644 --- a/cpp-mdspan/miniWeather_kokkos.hpp +++ b/cpp-mdspan/miniWeather_kokkos.hpp @@ -1,9 +1,30 @@ #pragma once #include "miniWeather_common.hpp" -#include +#include "Kokkos_Core.hpp" +#include "cuda/std/array" -using kokkos_execution_policy = Kokkos::DefaultExecutionSpace; +#if defined(MINIWEATHER_KOKKOS_OPENACC) +# if ! defined(KOKKOS_ENABLE_OPENACC) +# error "Kokkos OpenACC is not enabled" +# endif + +// Users aren't allowed to include OpenACC/Kokkos_OpenACC.hpp directly. +//#include "OpenACC/Kokkos_OpenACC.hpp" +using kokkos_execution_policy = Kokkos::Experimental::OpenACC; + +#elif defined(MINIWEATHER_KOKKOS_SERIAL) +# if ! defined(KOKKOS_ENABLE_SERIAL) +# error "Kokkos Serial is not enabled" +# endif + +// Users aren't allowed to include Serial/Kokkos_Serial.hpp directly. +//#include "Serial/Kokkos_Serial.hpp" +using kokkos_execution_policy = Kokkos::Serial; + +#else +# error "No Kokkos execution policy defined" +#endif template using md_range_policy = Kokkos::MDRangePolicy< @@ -287,6 +308,19 @@ void initialize_cell_averaged_fluid_state( "initialize_cell_averaged_fluid_state", md_range_policy<2>(exec_policy, {0, 0}, {nz + 2*hs, nx + 2*hs}), KOKKOS_LAMBDA(int k, int i) { + // OpenACC doesn't support static arrays, even if they are constexpr. 
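+      // (Aside, a sanity check on the 3-point Gauss-Legendre rule restated
+      // below -- not used by the code: the weights 5/18, 8/18, 5/18 sum to
+      // 1, and the rule integrates polynomials up to degree 5 exactly on
+      // [0,1]; e.g. for f(x) = x^2,
+      //   (5/18)*x0^2 + (8/18)*0.5^2 + (5/18)*x2^2 = 1/3
+      // with x0 = (1 - sqrt(3/5))/2 and x2 = (1 + sqrt(3/5))/2.)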
+ constexpr int nqpoints = 3; + constexpr cuda::std::array qpoints{ + 0.112701665379258311482073460022E0, + 0.500000000000000000000000000000E0, + 0.887298334620741688517926539980E0 + }; + constexpr cuda::std::array qweights{ + 0.277777777777777777777777777779E0, + 0.444444444444444444444444444444E0, + 0.277777777777777777777777777779E0 + }; + //Initialize the state to zero for (int ll = 0; ll < NUM_VARS; ++ll) { state(ll, k, i) = 0.0; @@ -332,6 +366,14 @@ void compute_hydrostatic_background_state( "compute_hydrostatic_background_state(1)", range_1d_type(exec_policy, 0, nz + 2*hs), KOKKOS_LAMBDA(int k) { + // OpenACC doesn't support static arrays, even if they are constexpr. + constexpr int nqpoints = 3; + constexpr cuda::std::array qweights{ + 0.277777777777777777777777777779E0, + 0.444444444444444444444444444444E0, + 0.277777777777777777777777777779E0 + }; + hy_dens_cell[k] = 0.0; hy_dens_theta_cell[k] = 0.0; for (int kk = 0; kk < nqpoints; ++kk) { @@ -369,9 +411,28 @@ reduction_result local_reductions( auto hy_dens_cell = const_arrays.hy_dens_cell(); auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell(); + // Kokkos doesn't currently implement parallel_reduce(MDRangePolicy) for OpenACC. +#if defined(MINIWEATHER_KOKKOS_OPENACC) +# if defined(KOKKOS_ENABLE_CUDA) + auto my_exec_policy = Kokkos::Cuda{}; +# elif defined(KOKKOS_ENABLE_SERIAL) + auto my_exec_policy = Kokkos::Serial{}; +# else +# error "No fall-back execution policy defined" +# endif +#else + auto my_exec_policy = exec_policy; +#endif + + Kokkos::MDRangePolicy< + std::remove_cvref_t, + Kokkos::Rank<2>, + Kokkos::IndexType> + range(my_exec_policy, {0, 0}, {nz, nx}); + Kokkos::parallel_reduce( "local_reductions", - md_range_policy<2>(exec_policy, {0, 0}, {nz, nx}), + range, KOKKOS_LAMBDA(int k, int i, reduction_result& thread_local_result) { double r = state(ID_DENS, k+hs, i+hs) + hy_dens_cell[hs+k]; // Density double u = state(ID_UMOM, k+hs, i+hs) / r; // U-wind @@ -384,7 +445,6 @@ reduction_result local_reductions( thread_local_result.mass += r *dx*dz; // Accumulate domain mass thread_local_result.te += (ke + ie)*dx*dz; // Accumulate domain total energy }, result); - return result; } From d94a521f470c0f4bf1b5ffd1a72fcc68f53a2286 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 9 Apr 2025 01:30:05 +0300 Subject: [PATCH 79/83] Add stdpar implementation and builds --- cpp-mdspan/CMakeLists.txt | 66 ++++- cpp-mdspan/build/cmake-kermit.sh | 1 + cpp-mdspan/miniWeather_stdpar.hpp | 449 ++++++++++++++++++++++++++++++ 3 files changed, 508 insertions(+), 8 deletions(-) create mode 100644 cpp-mdspan/miniWeather_stdpar.hpp diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 0314c94..aa4c957 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -120,17 +120,10 @@ FetchContent_MakeAvailable(Kokkos) if (NOT kokkos_POPULATED) message(FATAL_ERROR "Kokkos was not found") endif() -message(STATUS "Kokkos_SOURCE_DIR: ${Kokkos_SOURCE_DIR}") -message(STATUS "Kokkos_BINARY_DIR: ${Kokkos_BINARY_DIR}") -#message(STATUS "Kokkos_INCLUDE_DIRS: ${Kokkos_INCLUDE_DIRS}") # this is not defined; why? -message(STATUS "kokkos_SOURCE_DIR: ${kokkos_SOURCE_DIR}") -message(STATUS "kokkos_BINARY_DIR: ${kokkos_BINARY_DIR}") -message(STATUS "kokkos_INCLUDE_DIRS: ${kokkos_INCLUDE_DIRS}") # Use the mdspan version that Kokkos installed, if Kokkos was indeed installed. # Note that this is not a complete mdspan source tree; it just includes the headers. 
-#if ((DEFINED Kokkos_SOURCE_DIR) AND (EXISTS "${Kokkos_SOURCE_DIR}")) if (kokkos_POPULATED) set(MINIWEATHER_MDSPAN_INCLUDE "${Kokkos_SOURCE_DIR}/tpls/mdspan/include") message(STATUS "Using Kokkos' mdspan headers: ${MINIWEATHER_MDSPAN_INCLUDE}") @@ -200,7 +193,6 @@ if (kokkos_POPULATED) ) endif() -# FIXME (mfh 2025/04/04) Figure out how to pass OpenACC flags to Kokkos. if (kokkos_POPULATED AND OpenACC_FOUND) set(Kokkos_ENABLE_OPENACC ON) @@ -239,5 +231,63 @@ if (OpenACC_FOUND) ) endif() +# Option to enable stdpar +set(MINIWEATHER_ENABLE_STDPAR DETECT CACHE STRING "Enable stdpar using the -stdpar flag") +if (MINIWEATHER_ENABLE_STDPAR) + add_executable(miniWeather_stdpar_cpu miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_stdpar_cpu PRIVATE "${PROJECT_SOURCE_DIR}") + # If building with Kokkos, we've already added the mdspan headers + # to the include path (see above). + if (NOT kokkos_POPULATED) + target_link_libraries(miniWeather_stdpar_cpu PRIVATE std::mdspan) + endif() + target_compile_options(miniWeather_stdpar_cpu PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> + ) + target_compile_options(miniWeather_stdpar_cpu PRIVATE "-stdpar=multicore") + target_link_options(miniWeather_stdpar_cpu PRIVATE "-stdpar=multicore") +endif() + +if (MINIWEATHER_ENABLE_STDPAR) + add_executable(miniWeather_stdpar_gpu miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_stdpar_gpu PRIVATE "${PROJECT_SOURCE_DIR}") + + # If building with Kokkos, we've already added the mdspan headers + # to the include path (see above). + if (NOT kokkos_POPULATED) + target_link_libraries(miniWeather_stdpar_gpu PRIVATE std::mdspan) + endif() + target_compile_options(miniWeather_stdpar_gpu PRIVATE + $<$,$,$>: + -Wall> + $<$: + /W4> + ) + target_compile_options(miniWeather_stdpar_gpu PRIVATE "-stdpar=gpu") + target_link_options(miniWeather_stdpar_gpu PRIVATE "-stdpar=gpu") +endif() + +if (MINIWEATHER_ENABLE_STDPAR AND OpenACC_FOUND) + add_executable(miniWeather_stdpar_openacc miniWeather_mdspan.cpp miniWeather_common.cpp) + target_include_directories(miniWeather_stdpar_openacc PRIVATE "${PROJECT_SOURCE_DIR}") + + # If building with Kokkos, we've already added the mdspan headers + # to the include path (see above). 
+  if (NOT kokkos_POPULATED)
+    target_link_libraries(miniWeather_stdpar_openacc PRIVATE std::mdspan)
+  endif()
+  target_link_libraries(miniWeather_stdpar_openacc PRIVATE OpenACC::OpenACC_CXX)
+  target_compile_options(miniWeather_stdpar_openacc PRIVATE
+    $<$,$,$>:
+    -Wall>
+    $<$:
+    /W4>
+  )
+  target_compile_options(miniWeather_stdpar_openacc PRIVATE "-stdpar=gpu:acc")
+  target_link_options(miniWeather_stdpar_openacc PRIVATE "-stdpar=gpu:acc")
+endif()
diff --git a/cpp-mdspan/build/cmake-kermit.sh b/cpp-mdspan/build/cmake-kermit.sh
index 5da9b6e..d03931a 100755
--- a/cpp-mdspan/build/cmake-kermit.sh
+++ b/cpp-mdspan/build/cmake-kermit.sh
@@ -21,6 +21,7 @@ LDFLAGS="${PNETCDF_LDFLAGS}" CXXFLAGS="${PNETCDF_CXXFLAGS}" cmake \
   -DCMAKE_Fortran_COMPILER=mpif90 \
   -DCMAKE_VERBOSE_MAKEFILE=ON \
   -DFETCHCONTENT_SOURCE_DIR_KOKKOS="${KOKKOS_ROOT}" \
+  -DMINIWEATHER_ENABLE_STDPAR=ON \
   ${SRC_ROOT}
 # -DCMAKE_CXX_FLAGS="-stdpar"
diff --git a/cpp-mdspan/miniWeather_stdpar.hpp b/cpp-mdspan/miniWeather_stdpar.hpp
new file mode 100644
index 0000000..b1004b9
--- /dev/null
+++ b/cpp-mdspan/miniWeather_stdpar.hpp
@@ -0,0 +1,449 @@
+#pragma once
+
+#include "miniWeather_common.hpp"
+#include <algorithm>
+#include <execution>
+#include <ranges>
+
+using stdpar_ranges_execution_policy = std::execution::parallel_policy;
+
+constexpr auto stdpar_md_range(stdpar_ranges_execution_policy, int M, int N) {
+  return std::views::cartesian_product(std::views::iota(0, M), std::views::iota(0, N));
+}
+
+constexpr auto stdpar_md_range(stdpar_ranges_execution_policy, int M, int N, int P) {
+  return std::views::cartesian_product(std::views::iota(0, M), std::views::iota(0, N), std::views::iota(0, P));
+}
+
+//Set this MPI task's halo values in the x-direction.
+template
+void set_halo_values_x(
+  stdpar_ranges_execution_policy exec_policy,
+  view_3d state,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  using std::begin;
+  using std::end;
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+
+  ////////////////////////////////////////////////////////////////////////
+  // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS
+  // (1) give    state(1:hs,1:nz,1:NUM_VARS)       to   my left  neighbor
+  // (2) receive state(1-hs:0,1:nz,1:NUM_VARS)     from my left  neighbor
+  // (3) give    state(nx-hs+1:nx,1:nz,1:NUM_VARS) to   my right neighbor
+  // (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor
+  ////////////////////////////////////////////////////////////////////////
+
+  //////////////////////////////////////////////////////
+  // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI
+  //////////////////////////////////////////////////////
+
+  {
+    auto range = stdpar_md_range(exec_policy, NUM_VARS, nz);
+    std::for_each(exec_policy, begin(range), end(range),
+      [=] (auto&& ll_k_pair) {
+        auto [ll, k] = ll_k_pair;
+        state(ll, k+hs, 0) = state(ll, k+hs, nx+hs-2);
+        state(ll, k+hs, 1) = state(ll, k+hs, nx+hs-1);
+        state(ll, k+hs, nx+hs) = state(ll, k+hs, hs);
+        state(ll, k+hs, nx+hs+1) = state(ll, k+hs, hs+1);
+      });
+  }
+
+  ////////////////////////////////////////////////////
+
+  if (data_spec_int == DATA_SPEC_INJECTION) {
+    if (scalars.myrank == 0) {
+      auto hy_dens_cell       = arrays.hy_dens_cell();
+      auto hy_dens_theta_cell = arrays.hy_dens_theta_cell();
+      const int k_beg = scalars.k_beg;
+
+      auto range = stdpar_md_range(exec_policy, nz, hs);
+      std::for_each(exec_policy, begin(range), end(range),
+        [=] (auto&& k_i_pair) {
+          auto [k, i] = k_i_pair;
+          const double z = (k_beg + k+0.5)*dz;
+          if (fabs(z-3*zlen/4) <= zlen/16) {
+            state(ID_UMOM, k+hs, i) = (state(ID_DENS, k+hs, 
i) + hy_dens_cell[k+hs]) * 50.0; + state(ID_RHOT, k+hs, i) = (state(ID_DENS, k+hs, i) + hy_dens_cell[k+hs]) * 298.0 - + hy_dens_theta_cell[k+hs]; + } + }); + } + } +} + +//Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI +//decomposition in the vertical direction +template +void set_halo_values_z( + stdpar_ranges_execution_policy exec_policy, + view_3d state, + const global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + auto hy_dens_cell = arrays.hy_dens_cell(); + + auto range = stdpar_md_range(exec_policy, NUM_VARS, nx + 2*hs); + std::for_each(exec_policy, begin(range), end(range), + [=] (auto&& ll_i_pair) { + auto [ll, i] = ll_i_pair; + if (ll == ID_WMOM) { + state(ll, 0, i) = 0.0; + state(ll, 1, i) = 0.0; + state(ll, nz+hs, i) = 0.0; + state(ll, nz+hs+1, i) = 0.0; + } else if (ll == ID_UMOM) { + state(ll, 0, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[0]; + state(ll, 1, i) = state(ll, hs, i) / hy_dens_cell[hs] * hy_dens_cell[1]; + state(ll, nz+hs, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs]; + state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i) / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1]; + } else { + state(ll, 0, i) = state(ll, hs, i); + state(ll, 1, i) = state(ll, hs, i); + state(ll, nz+hs, i) = state(ll, nz+hs-1, i); + state(ll, nz+hs+1, i) = state(ll, nz+hs-1, i); + } + }); +} + +//Compute the time tendencies of the fluid state using forcing in the x-direction +//Since the halos are set in a separate routine, this will not require MPI +//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity) +//Then, compute the tendencies using those fluxes +template +void compute_tendencies_x( + stdpar_ranges_execution_policy exec_policy, + view_3d_const state, + view_3d flux, view_3d tend, double dt, + const global_const_scalars& scalars, + const global_const_arrays& arrays) +{ + const int nx = scalars.nx; + const int nz = scalars.nz; + // Hyperviscosity coefficient + const double hv_coef = -hv_beta * dx / (16*dt); + auto hy_dens_cell = arrays.hy_dens_cell(); + auto hy_dens_theta_cell = arrays.hy_dens_theta_cell(); + + //Compute fluxes in the x-direction for each cell + + auto range = stdpar_md_range(exec_policy, nz, nx+1); + std::for_each(exec_policy, begin(range), end(range), + [=] (auto&& k_i_pair) { + auto [k, i] = k_i_pair; + //Use fourth-order interpolation from four cell averages + //to compute the value at the interface in question + std::array d3_vals; + std::array vals; + for (int ll = 0; ll < NUM_VARS; ++ll) { + std::array stencil; + for (int s = 0; s < sten_size; ++s) { + stencil[s] = state(ll, k+hs, i+s); + } + //Fourth-order-accurate interpolation of the state + vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12; + //First-order-accurate interpolation of the third spatial derivative + //of the state (for artificial viscosity) + d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]; + } + + //Compute density, u-wind, w-wind, potential temperature, + //and pressure (r,u,w,t,p respectively) + double r = vals[ID_DENS] + hy_dens_cell[k+hs]; + double u = vals[ID_UMOM] / r; + double w = vals[ID_WMOM] / r; + double t = ( vals[ID_RHOT] + hy_dens_theta_cell[k+hs] ) / r; + double p = C0 * pow(r*t, gamm); + + //Compute the flux vector + flux(ID_DENS, k, i) = r*u - hv_coef*d3_vals[ID_DENS]; + flux(ID_UMOM, k, i) = r*u*u+p - 
+      flux(ID_WMOM, k, i) = r*u*w   - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*u*t   - hv_coef*d3_vals[ID_RHOT];
+    });
+
+  //Use the fluxes to compute tendencies for each cell
+  {
+    view_3d_const flux_c = flux;
+
+    auto range = stdpar_md_range(exec_policy, NUM_VARS, nz, nx);
+    std::for_each(exec_policy, begin(range), end(range),
+      [=] (auto&& ll_k_i_triple) {
+        auto [ll, k, i] = ll_k_i_triple;
+        tend(ll, k, i) = -( flux_c(ll, k, i+1) - flux_c(ll, k, i) ) / dx;
+      });
+  }
+}
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//Since the halos are set in a separate routine, this will not require MPI
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+template<class view_3d_const, class view_3d>
+void compute_tendencies_z(
+  stdpar_ranges_execution_policy exec_policy,
+  view_3d_const state,
+  view_3d flux, view_3d tend, double dt,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  // Hyperviscosity coefficient
+  const double hv_coef = -hv_beta * dz / (16*dt);
+  auto hy_dens_int = arrays.hy_dens_int();
+  auto hy_dens_theta_int = arrays.hy_dens_theta_int();
+  auto hy_pressure_int = arrays.hy_pressure_int();
+
+  //Compute fluxes in the z-direction for each cell
+  auto range = stdpar_md_range(exec_policy, nz + 1, nx);
+  std::for_each(exec_policy, begin(range), end(range),
+    [=] (auto&& k_i_pair) {
+      auto [k, i] = k_i_pair;
+      //Use fourth-order interpolation from four cell averages
+      //to compute the value at the interface in question
+      std::array<double, NUM_VARS> d3_vals;
+      std::array<double, NUM_VARS> vals;
+      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        std::array<double, sten_size> stencil;
+        for (int s = 0; s < sten_size; ++s) {
+          stencil[s] = state(ll, k+s, i+hs);
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12;
+        //First-order-accurate interpolation of the third spatial derivative
+        //of the state
+        d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3];
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature,
+      //and pressure (r,u,w,t,p respectively)
+      double r = vals[ID_DENS] + hy_dens_int[k];
+      double u = vals[ID_UMOM] / r;
+      double w = vals[ID_WMOM] / r;
+      double t = (vals[ID_RHOT] + hy_dens_theta_int[k]) / r;
+      double p = C0 * pow(r * t, gamm) - hy_pressure_int[k];
+      //Enforce vertical boundary condition and exact mass conservation
+      if (k == 0 || k == nz) {
+        w = 0;
+        d3_vals[ID_DENS] = 0;
+      }
+
+      //Compute the flux vector with hyperviscosity
+      flux(ID_DENS, k, i) = r*w     - hv_coef*d3_vals[ID_DENS];
+      flux(ID_UMOM, k, i) = r*w*u   - hv_coef*d3_vals[ID_UMOM];
+      flux(ID_WMOM, k, i) = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
+      flux(ID_RHOT, k, i) = r*w*t   - hv_coef*d3_vals[ID_RHOT];
+    });
+
+  //Use the fluxes to compute tendencies for each cell
+  {
+    view_3d_const flux_c = flux;
+
+    auto range = stdpar_md_range(exec_policy, NUM_VARS, nz, nx);
+    std::for_each(exec_policy, begin(range), end(range),
+      [=] (auto&& ll_k_i_triple) {
+        auto [ll, k, i] = ll_k_i_triple;
+        tend(ll, k, i) = -( flux_c(ll, k+1, i) - flux_c(ll, k, i) ) / dz;
+        if (ll == ID_WMOM) {
+          tend(ll, k, i) = tend(ll, k, i) - state(ID_DENS, k+hs, i+hs)*grav;
+        }
+      });
+  }
+}
+
+template<class view_3d_const, class view_3d>
+void apply_tendencies_to_fluid_state(
+  stdpar_ranges_execution_policy exec_policy,
+  view_3d_const state_init,
+  view_3d state_out,
+  double dt /* not scalars.dt */,
+  view_3d tend,
+  const global_const_scalars& scalars,
+  const global_const_arrays& arrays)
+{
+  const int nx = scalars.nx;
+  const int nz = scalars.nz;
+  auto hy_dens_cell = arrays.hy_dens_cell();
+  view_3d_const tend_c = tend;
+
+  auto range = stdpar_md_range(exec_policy, NUM_VARS, nz, nx);
+  std::for_each(exec_policy, begin(range), end(range),
+    [=] (auto&& ll_k_i_triple) {
+      auto [ll, k, i] = ll_k_i_triple;
+      if (data_spec_int == DATA_SPEC_GRAVITY_WAVES && ll == ID_WMOM) {
+        const int i_beg = scalars.i_beg;
+        const int k_beg = scalars.k_beg;
+        const double x = (i_beg + i+0.5)*dx;
+        const double z = (k_beg + k+0.5)*dz;
+        const double wpert = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0);
+        tend(ID_WMOM, k, i) += wpert * hy_dens_cell[hs+k];
+      }
+      state_out(ll, k+hs, i+hs) = state_init(ll, k+hs, i+hs) + dt * tend_c(ll, k, i);
+    });
+}
+
+// Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+void initialize_cell_averaged_fluid_state(
+  stdpar_ranges_execution_policy exec_policy,
+  view_3d state, view_3d state_tmp,
+  int nx, int nz,
+  int i_beg, int k_beg)
+{
+  auto range = stdpar_md_range(exec_policy, nz + 2*hs, nx + 2*hs);
+  std::for_each(exec_policy, begin(range), end(range),
+    [=] (auto&& k_i_pair) {
+      auto [k, i] = k_i_pair;
+      // OpenACC doesn't support static arrays, even if they are constexpr.
+      constexpr int nqpoints = 3;
+      constexpr cuda::std::array qpoints{
+        0.112701665379258311482073460022E0,
+        0.500000000000000000000000000000E0,
+        0.887298334620741688517926539980E0
+      };
+      constexpr cuda::std::array qweights{
+        0.277777777777777777777777777779E0,
+        0.444444444444444444444444444444E0,
+        0.277777777777777777777777777779E0
+      };
+
+      //Initialize the state to zero
+      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        state(ll, k, i) = 0.0;
+      }
+      //Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for (int kk = 0; kk < nqpoints; ++kk) {
+        for (int ii = 0; ii < nqpoints; ++ii) {
+          //Compute the x,z location within the global domain based on cell and quadrature index
+          const double x = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx;
+          const double z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz;
+
+          //Set the fluid state based on the user's specification
+          auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, x, z);
+
+          //Store into the fluid state array
+          state(ID_DENS, k, i) = state(ID_DENS, k, i) + r * qweights[ii]*qweights[kk];
+          state(ID_UMOM, k, i) = state(ID_UMOM, k, i) + (r+hr)*u * qweights[ii]*qweights[kk];
+          state(ID_WMOM, k, i) = state(ID_WMOM, k, i) + (r+hr)*w * qweights[ii]*qweights[kk];
+          state(ID_RHOT, k, i) = state(ID_RHOT, k, i) + ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk];
+        }
+      }
+      for (int ll = 0; ll < NUM_VARS; ++ll) {
+        state_tmp(ll, k, i) = state(ll, k, i);
+      }
+    });
+}
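+
+// Note on the quadrature constants used above and below: the three qpoints
+// are the 3-point Gauss-Legendre nodes mapped from [-1,1] to [0,1], namely
+// (1 - sqrt(3/5))/2, 1/2, and (1 + sqrt(3/5))/2, and the qweights are
+// 5/18, 8/18, and 5/18.  A 3-point Gauss-Legendre rule integrates
+// polynomials of degree up to 5 exactly, which is more than enough for
+// fourth-order-accurate cell averages of the initial condition.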
+
+void compute_hydrostatic_background_state(
+  stdpar_ranges_execution_policy exec_policy,
+  view_1d hy_dens_cell,
+  view_1d hy_dens_theta_cell,
+  view_1d hy_dens_int,
+  view_1d hy_dens_theta_int,
+  view_1d hy_pressure_int,
+  int nz,
+  int k_beg)
+{
+  //Compute the hydrostatic background state over vertical cell averages
+  {
+    auto range = stdpar_md_range(exec_policy, nz + 2*hs);
+    std::for_each(exec_policy, begin(range), end(range),
+      [=] (int k) {
+        // OpenACC doesn't support static arrays, even if they are constexpr.
+        constexpr int nqpoints = 3;
+        constexpr cuda::std::array qpoints{
+          0.112701665379258311482073460022E0,
+          0.500000000000000000000000000000E0,
+          0.887298334620741688517926539980E0
+        };
+        constexpr cuda::std::array qweights{
+          0.277777777777777777777777777779E0,
+          0.444444444444444444444444444444E0,
+          0.277777777777777777777777777779E0
+        };
+
+        hy_dens_cell[k] = 0.0;
+        hy_dens_theta_cell[k] = 0.0;
+        for (int kk = 0; kk < nqpoints; ++kk) {
+          const double z = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz;
+          //Set the fluid state based on the user's specification
+          auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z);
+          hy_dens_cell[k] = hy_dens_cell[k] + hr * qweights[kk];
+          hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk];
+        }
+      });
+  }
+
+  //Compute the hydrostatic background state at vertical cell interfaces
+  {
+    auto range = stdpar_md_range(exec_policy, nz + 1);
+    std::for_each(exec_policy, begin(range), end(range),
+      [=] (int k) {
+        const double z = (k_beg + k)*dz;
+        auto [r, u, w, t, hr, ht] = get_test_case(data_spec_int, 0.0, z);
+        hy_dens_int[k] = hr;
+        hy_dens_theta_int[k] = hr * ht;
+        hy_pressure_int[k] = C0 * pow(hr * ht, gamm);
+      });
+  }
+}
+
+struct stdpar_reducer {
+  view_3d_const state;
+  view_1d_const hy_dens_cell;
+  view_1d_const hy_dens_theta_cell;
+
+  reduction_result local_result(std::tuple<int, int> k_i_pair) const {
+    auto [k, i] = k_i_pair;
+
+    double r  = state(ID_DENS, k+hs, i+hs) + hy_dens_cell[hs+k];             // Density
+    double u  = state(ID_UMOM, k+hs, i+hs) / r;                              // U-wind
+    double w  = state(ID_WMOM, k+hs, i+hs) / r;                              // W-wind
+    double th = (state(ID_RHOT, k+hs, i+hs) + hy_dens_theta_cell[hs+k]) / r; // Potential Temperature (theta)
+    double p  = C0 * pow(r * th, gamm);    // Pressure
+    double t  = th / pow(p0 / p, rd / cp); // Temperature
+    double ke = r*(u*u+w*w);               // Kinetic Energy
+    double ie = r*cv*t;                    // Internal Energy
+    return reduction_result{
+      .mass = r         *dx*dz, // domain mass
+      .te   = (ke + ie) *dx*dz  // domain total energy
+    };
+  }
+
+  reduction_result
+  operator() (const reduction_result& r0, const reduction_result& r1) const {
+    return reduction_result{
+      .mass = r0.mass + r1.mass, // Accumulate domain mass
+      .te   = r0.te   + r1.te    // Accumulate domain total energy
+    };
+  }
+
+  reduction_result
+  operator() (std::tuple<int, int> k_i_pair, const reduction_result& thread_local_result) const {
+    return (*this)(local_result(k_i_pair), thread_local_result);
+  }
+
+  reduction_result
+  operator() (const reduction_result& thread_local_result, std::tuple<int, int> k_i_pair) const {
+    return (*this)(thread_local_result, local_result(k_i_pair));
+  }
+
+  reduction_result
+  operator() (std::tuple<int, int> p0, std::tuple<int, int> p1) const {
+    return (*this)(local_result(p0), local_result(p1));
+  }
+};
+
+template<class view_3d_const>
+reduction_result local_reductions(
+  stdpar_ranges_execution_policy exec_policy,
+  view_3d_const state,
+  const global_const_scalars& const_scalars,
+  const global_const_arrays& const_arrays)
+{
+  reduction_result result{0.0, 0.0};
+  const int nx = const_scalars.nx;
+  const int nz = const_scalars.nz;
+  auto hy_dens_cell = const_arrays.hy_dens_cell();
+  auto hy_dens_theta_cell = const_arrays.hy_dens_theta_cell();
+
+  auto range = stdpar_md_range(exec_policy, nz, nx);
+  auto reducer = stdpar_reducer{state, hy_dens_cell, hy_dens_theta_cell};
+  result = std::reduce(exec_policy, begin(range), end(range), result, reducer);
+  return result;
+}
From c9ac57778af90a1191071b50e438eea221966077 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Wed, 9 Apr 2025 01:38:34 +0300
Subject: [PATCH 80/83] Add -Wall to NVHPC build
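
Warning flags were previously enabled only for GNU, Clang, and
AppleClang.  Extend each target's warning block so that NVHPC gets
-Wall as well.  Every target in this file now uses the same pattern,
sketched here for one of them:

    target_compile_options(miniWeather_serial PRIVATE
      $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
      -Wall>
      $<$<CXX_COMPILER_ID:MSVC>:
      /W4>
    )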
---
 cpp-mdspan/CMakeLists.txt | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt
index aa4c957..3f3db59 100644
--- a/cpp-mdspan/CMakeLists.txt
+++ b/cpp-mdspan/CMakeLists.txt
@@ -11,7 +11,7 @@ project(miniWeather-mdspan
 )
 include(FetchContent)
 
-message(STATUS "C++ compiler ID: ${CMAKE_CXX_COMPILER_ID}")
+message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
 
 # Option to override which version of the C++ Standard to use
 set(MINIWEATHER_CXX_STANDARD DETECT CACHE STRING "Override the default CXX_STANDARD")
@@ -155,7 +155,7 @@ if (NOT kokkos_POPULATED)
   target_link_libraries(test_unique_mdarray PRIVATE std::mdspan)
 endif()
 target_compile_options(test_unique_mdarray PRIVATE
-  $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+  $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
   -Wall>
   $<$<CXX_COMPILER_ID:MSVC>:
   /W4>
@@ -170,7 +170,7 @@
   target_link_libraries(miniWeather_serial PRIVATE std::mdspan)
 endif()
 target_compile_options(miniWeather_serial PRIVATE
-  $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+  $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
   -Wall>
   $<$<CXX_COMPILER_ID:MSVC>:
   /W4>
@@ -186,7 +186,7 @@ if (kokkos_POPULATED)
 
   # We got mdspan from Kokkos.
   target_compile_options(miniWeather_kokkos_serial PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
    -Wall>
    $<$<CXX_COMPILER_ID:MSVC>:
    /W4>
@@ -206,7 +206,7 @@ if (kokkos_POPULATED AND OpenACC_FOUND)
   # We got mdspan from Kokkos.
   target_link_libraries(miniWeather_kokkos_openacc PRIVATE OpenACC::OpenACC_CXX)
   target_compile_options(miniWeather_kokkos_openacc PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
    -Wall>
    $<$<CXX_COMPILER_ID:MSVC>:
    /W4>
@@ -224,7 +224,7 @@ if (OpenACC_FOUND)
   endif()
   target_link_libraries(miniWeather_openacc PRIVATE OpenACC::OpenACC_CXX)
   target_compile_options(miniWeather_openacc PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
    -Wall>
    $<$<CXX_COMPILER_ID:MSVC>:
    /W4>
@@ -244,7 +244,7 @@ if (MINIWEATHER_ENABLE_STDPAR)
     target_link_libraries(miniWeather_stdpar_cpu PRIVATE std::mdspan)
   endif()
   target_compile_options(miniWeather_stdpar_cpu PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
    -Wall>
    $<$<CXX_COMPILER_ID:MSVC>:
    /W4>
@@ -263,7 +263,7 @@
     target_link_libraries(miniWeather_stdpar_gpu PRIVATE std::mdspan)
   endif()
   target_compile_options(miniWeather_stdpar_gpu PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
    -Wall>
    $<$<CXX_COMPILER_ID:MSVC>:
    /W4>
@@ -283,7 +283,7 @@
   endif()
   target_link_libraries(miniWeather_stdpar_openacc PRIVATE OpenACC::OpenACC_CXX)
   target_compile_options(miniWeather_stdpar_openacc PRIVATE
-    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>>:
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
    -Wall>
    $<$<CXX_COMPILER_ID:MSVC>:
    /W4>
From c4f3d665907bc3b15b593e0bafd963ddcb103e72 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen
Date: Wed, 9 Apr 2025 01:49:10 +0300
Subject: [PATCH 81/83] Fix builds and disable builds that don't currently work

We have a CUB (ForEachInExtents) version, but it hasn't been tested yet,
because HPC SDK 25.1 doesn't have a new enough CUB version yet.

Fix stdpar build by using cartesian_product from Gonzalo Brito's
tutorial.
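
All of the stdpar kernels traverse a multidimensional index space with
the same idiom: form a cartesian_product of iota views, then run
std::for_each over it with a parallel execution policy.  A minimal
sketch of that idiom follows; for_each_2d is an illustrative helper,
not a function in this repository, and it assumes the vendored header
exposes tl::views::cartesian_product:

    #include <algorithm>
    #include <execution>
    #include <ranges>
    #include "cartesian_product.hpp"

    // Apply f(k, i) in parallel over the 2-D index space [0,nz) x [0,nx).
    template<class F>
    void for_each_2d(int nz, int nx, F f) {
      using std::begin;
      using std::end;
      auto range = tl::views::cartesian_product(
        std::views::iota(0, nz), std::views::iota(0, nx));
      std::for_each(std::execution::par, begin(range), end(range),
        [=] (auto&& k_i_pair) {
          auto [k, i] = k_i_pair;  // each range element is a tuple of indices
          f(k, i);
        });
    }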
--- cpp-mdspan/CMakeLists.txt | 108 +- cpp-mdspan/build/cmake-kermit.sh | 11 +- cpp-mdspan/cartesian_product.hpp | 2027 +++++++++++++++++++++++ cpp-mdspan/miniWeather_common.cpp | 4 +- cpp-mdspan/miniWeather_common.hpp | 170 +- cpp-mdspan/miniWeather_cub.hpp | 439 +++++ cpp-mdspan/miniWeather_generic_algs.hpp | 74 +- cpp-mdspan/miniWeather_kokkos.hpp | 146 +- cpp-mdspan/miniWeather_mdspan.cpp | 59 +- cpp-mdspan/miniWeather_output.hpp | 6 +- cpp-mdspan/miniWeather_serial.hpp | 34 +- cpp-mdspan/miniWeather_stdpar.hpp | 110 +- 12 files changed, 2935 insertions(+), 253 deletions(-) create mode 100644 cpp-mdspan/cartesian_product.hpp create mode 100644 cpp-mdspan/miniWeather_cub.hpp diff --git a/cpp-mdspan/CMakeLists.txt b/cpp-mdspan/CMakeLists.txt index 3f3db59..b285903 100644 --- a/cpp-mdspan/CMakeLists.txt +++ b/cpp-mdspan/CMakeLists.txt @@ -63,35 +63,49 @@ else() message(STATUS "OpenACC not found") endif() - # Find CUDA Toolkit; it's not required, # but other things (like CUB) depend on it. find_package(CUDAToolkit) +set(MINIWEATHER_CUB_FOUND FALSE) + +if (CUDAToolkit_FOUND AND (NOT (DEFINED CUB_INCLUDE_DIR))) + # Check whether CUDA Toolkit's include directory has CUB in it. + find_path(CUB_INCLUDE_DIR + NAMES cub/cub.cuh + PATHS + ${CUDAToolkit_INCLUDE_DIRS} + /usr/include + /usr/local/include + DOC "Path to CUB include directory." + ) + message(STATUS "Discovered CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") + if (CUDAToolkit_VERSION VERSION_LESS "12.8.0") + message(STATUS "CUDAToolkit_VERSION 12.6.85 is known to lack ForEachInExtents. \ +Your CUDAToolkit_VERSION is ${CUDAToolkit_VERSION}.") + endif() +endif() + if (DEFINED CUB_INCLUDE_DIR) - if (EXISTS "${CUB_INCLUDE_DIR}/cub/cub.cuh") - message(STATUS "User-defined CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") + set(CUB_TEST_FILE_0 "${CUB_INCLUDE_DIR}/cub/cub.cuh") + set(CUB_TEST_FILE_0_FOUND FALSE) + # This was added to later versions of CUB. + set(CUB_TEST_FILE_1 "${CUB_INCLUDE_DIR}/cub/device/device_for_each.cuh") + set(CUB_TEST_FILE_1_FOUND FALSE) + + if (EXISTS "${CUB_TEST_FILE_0}") + message(STATUS "User-defined CUB_INCLUDE_DIR ${CUB_INCLUDE_DIR} contains cub/cub.cuh") else() - message(FATAL_ERROR "CUB_INCLUDE_DIR=\"{CUB_INCLUDE_DIR}\" is \ - defined but \"${CUB_INCLUDE_DIR}/cub/cub.cuh\" does not exist.") + message(STATUS "User-defined CUB_INCLUDE_DIR ${CUB_INCLUDE_DIR} does NOT contain cub/cub.cuh") endif() -else() - if (CUDAToolkit_FOUND) - # Check whether CUDA Toolkit's include directory has CUB in it. - find_path(CUB_INCLUDE_DIR - NAMES cub/cub.cuh - PATHS - ${CUDAToolkit_INCLUDE_DIRS} - /usr/include - /usr/local/include - DOC "Path to CUB include directory." - ) - message(STATUS "Discovered CUB_INCLUDE_DIR: ${CUB_INCLUDE_DIR}") - if (CUDAToolkit_VERSION VERSION_LESS "12.8.0") - message(STATUS "CUDAToolkit_VERSION 12.6.85 is known to lack ForEachInExtents. 
\
-Your CUDAToolkit_VERSION is ${CUDAToolkit_VERSION}.")
-    endif()
+
+  if (EXISTS "${CUB_TEST_FILE_1}")
+    message(STATUS "User-defined CUB_INCLUDE_DIR ${CUB_INCLUDE_DIR} contains cub/device/device_for_each.cuh")
+  else()
+    message(STATUS "User-defined CUB_INCLUDE_DIR ${CUB_INCLUDE_DIR} does NOT contain cub/device/device_for_each.cuh")
   endif()
+
+  if ((EXISTS "${CUB_TEST_FILE_0}") AND (EXISTS "${CUB_TEST_FILE_1}"))
+    set(MINIWEATHER_CUB_FOUND TRUE)
+  endif()
 endif()
 
 # Please see Kokkos' CMake instructions:
@@ -161,8 +175,10 @@ target_compile_options(test_unique_mdarray PRIVATE
    /W4>
 )
 
+message(STATUS "Enabled miniWeather_serial")
 add_executable(miniWeather_serial miniWeather_mdspan.cpp miniWeather_common.cpp)
 target_include_directories(miniWeather_serial PRIVATE "${PROJECT_SOURCE_DIR}")
+target_compile_definitions(miniWeather_serial PRIVATE MINIWEATHER_SERIAL)
 
 # If building with Kokkos, we've already added the mdspan headers
 # to the include path (see above).
@@ -177,6 +193,7 @@ target_compile_options(miniWeather_serial PRIVATE
 )
 
 if (kokkos_POPULATED)
+  message(STATUS "Enabled miniWeather_kokkos_serial")
   add_executable(miniWeather_kokkos_serial miniWeather_mdspan.cpp miniWeather_common.cpp)
   target_include_directories(miniWeather_kokkos_serial PRIVATE "${PROJECT_SOURCE_DIR}")
@@ -196,6 +213,7 @@ endif()
 
 if (kokkos_POPULATED AND OpenACC_FOUND)
   set(Kokkos_ENABLE_OPENACC ON)
+  message(STATUS "Enabled miniWeather_kokkos_openacc")
   add_executable(miniWeather_kokkos_openacc miniWeather_mdspan.cpp miniWeather_common.cpp)
   target_include_directories(miniWeather_kokkos_openacc PRIVATE "${PROJECT_SOURCE_DIR}")
@@ -213,9 +231,13 @@ if (kokkos_POPULATED AND OpenACC_FOUND)
   )
 endif()
 
+# We don't have a "pure OpenACC" version yet.
+if (FALSE)
 if (OpenACC_FOUND)
+  message(STATUS "Enabled miniWeather_openacc")
   add_executable(miniWeather_openacc miniWeather_mdspan.cpp miniWeather_common.cpp)
   target_include_directories(miniWeather_openacc PRIVATE "${PROJECT_SOURCE_DIR}")
+  target_compile_definitions(miniWeather_openacc PRIVATE MINIWEATHER_OPENACC)
 
   # If building with Kokkos, we've already added the mdspan headers
   # to the include path (see above).
@@ -230,13 +252,18 @@ if (OpenACC_FOUND)
     /W4>
   )
 endif()
+endif()
 
+# Whether the C++ Standard in use is new enough for the stdpar version
+if (CMAKE_CXX_STANDARD GREATER_EQUAL 23)
+  set(MINIWEATHER_HAVE_STDPAR TRUE)
+else()
+  set(MINIWEATHER_HAVE_STDPAR FALSE)
+endif()
 # Option to enable stdpar
-set(MINIWEATHER_ENABLE_STDPAR DETECT CACHE STRING "Enable stdpar using the -stdpar flag")
+set(MINIWEATHER_ENABLE_STDPAR ${MINIWEATHER_HAVE_STDPAR} CACHE BOOL "Enable stdpar using the -stdpar flag")
 
 if (MINIWEATHER_ENABLE_STDPAR)
+  message(STATUS "Enabled miniWeather_stdpar_cpu")
   add_executable(miniWeather_stdpar_cpu miniWeather_mdspan.cpp miniWeather_common.cpp)
   target_include_directories(miniWeather_stdpar_cpu PRIVATE "${PROJECT_SOURCE_DIR}")
+  target_compile_definitions(miniWeather_stdpar_cpu PRIVATE MINIWEATHER_STDPAR)
+  target_compile_definitions(miniWeather_stdpar_cpu PRIVATE MINIWEATHER_STDPAR_CPU)
 
   # If building with Kokkos, we've already added the mdspan headers
   # to the include path (see above).
@@ -254,8 +281,11 @@ if (MINIWEATHER_ENABLE_STDPAR)
 endif()
 
 if (MINIWEATHER_ENABLE_STDPAR)
+  message(STATUS "Enabled miniWeather_stdpar_gpu")
   add_executable(miniWeather_stdpar_gpu miniWeather_mdspan.cpp miniWeather_common.cpp)
   target_include_directories(miniWeather_stdpar_gpu PRIVATE "${PROJECT_SOURCE_DIR}")
+  target_compile_definitions(miniWeather_stdpar_gpu PRIVATE MINIWEATHER_STDPAR)
+  target_compile_definitions(miniWeather_stdpar_gpu PRIVATE MINIWEATHER_STDPAR_GPU)
 
   # If building with Kokkos, we've already added the mdspan headers
   # to the include path (see above).
@@ -273,8 +303,11 @@ endif()
 
 if (MINIWEATHER_ENABLE_STDPAR AND OpenACC_FOUND)
+  message(STATUS "Enabled miniWeather_stdpar_openacc")
   add_executable(miniWeather_stdpar_openacc miniWeather_mdspan.cpp miniWeather_common.cpp)
   target_include_directories(miniWeather_stdpar_openacc PRIVATE "${PROJECT_SOURCE_DIR}")
+  target_compile_definitions(miniWeather_stdpar_openacc PRIVATE MINIWEATHER_STDPAR)
+  target_compile_definitions(miniWeather_stdpar_openacc PRIVATE MINIWEATHER_STDPAR_OPENACC)
 
   # If building with Kokkos, we've already added the mdspan headers
   # to the include path (see above).
@@ -291,3 +324,34 @@ if (MINIWEATHER_ENABLE_STDPAR AND OpenACC_FOUND)
   target_compile_options(miniWeather_stdpar_openacc PRIVATE "-stdpar=gpu:acc")
   target_link_options(miniWeather_stdpar_openacc PRIVATE "-stdpar=gpu:acc")
 endif()
+
+# Option to enable CUB
+set(MINIWEATHER_ENABLE_CUB ${MINIWEATHER_CUB_FOUND} CACHE BOOL "Enable CUB")
+
+if (MINIWEATHER_ENABLE_CUB)
+  message(STATUS "Enabled miniWeather_cub")
+  add_executable(miniWeather_cub miniWeather_mdspan.cpp miniWeather_common.cpp)
+  target_include_directories(miniWeather_cub PRIVATE "${PROJECT_SOURCE_DIR}")
+  target_compile_definitions(miniWeather_cub PRIVATE MINIWEATHER_CUB)
+
+  # If building with Kokkos, we've already added the mdspan headers
+  # to the include path (see above).
+  if (NOT kokkos_POPULATED)
+    target_link_libraries(miniWeather_cub PRIVATE std::mdspan)
+  endif()
+  target_compile_options(miniWeather_cub PRIVATE
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:NVHPC>>:
+    -Wall>
+    $<$<CXX_COMPILER_ID:MSVC>:
+    /W4>
+  )
+  # Building CUB with nvc++ requires the "-cuda" flag
+  target_compile_options(miniWeather_cub PRIVATE
+    $<$<CXX_COMPILER_ID:NVHPC>:
+    -cuda>
+  )
+  target_link_options(miniWeather_cub PRIVATE
+    $<$<CXX_COMPILER_ID:NVHPC>:
+    -cuda>
+  )
+endif()
diff --git a/cpp-mdspan/build/cmake-kermit.sh b/cpp-mdspan/build/cmake-kermit.sh
index d03931a..66c55ff 100755
--- a/cpp-mdspan/build/cmake-kermit.sh
+++ b/cpp-mdspan/build/cmake-kermit.sh
@@ -11,6 +11,14 @@ SRC_ROOT=/raid/mhoemmen/src/miniWeather/cpp-mdspan
 # "-stdpar": "Could not find librt library, needed by CUDA::cudart_static"
 # Adding "-rt" to LDFLAGS didn't seem to help.
 
+# Current version of stdpar code requires C++23,
+# because it uses std::ranges::views::cartesian_product.
+# nvc++ might not support that yet.
+MINIWEATHER_ENABLE_STDPAR=ON
+
+# We don't have ForEachInExtents yet with 25.1.
+MINIWEATHER_ENABLE_CUB=OFF + KOKKOS_ROOT="/raid/mhoemmen/src/kokkos/kokkos" # -DFETCHCONTENT_SOURCE_DIR_Kokkos="${KOKKOS_ROOT}" # -DKokkos_ROOT="${KOKKOS_ROOT}" @@ -21,7 +29,8 @@ LDFLAGS="${PNETCDF_LDFLAGS}" CXXFLAGS="${PNETCDF_CXXFLAGS}" cmake \ -DCMAKE_Fortran_COMPILER=mpif90 \ -DCMAKE_VERBOSE_MAKEFILE=ON \ -DFETCHCONTENT_SOURCE_DIR_KOKKOS="${KOKKOS_ROOT}" \ - -DMINIWEATHER_ENABLE_STDPAR=ON \ + -DMINIWEATHER_ENABLE_STDPAR=${MINIWEATHER_ENABLE_STDPAR} \ + -DMINIWEATHER_ENABLE_CUB=${MINIWEATHER_ENABLE_CUB} \ ${SRC_ROOT} # -DCMAKE_CXX_FLAGS="-stdpar" diff --git a/cpp-mdspan/cartesian_product.hpp b/cpp-mdspan/cartesian_product.hpp new file mode 100644 index 0000000..60ed85f --- /dev/null +++ b/cpp-mdspan/cartesian_product.hpp @@ -0,0 +1,2027 @@ +#pragma once +//#define DISABLE_CART_PROD_IOTA_SPEC +/* +Adapted from TartanLlama/ranges: https://github.com/TartanLlama/ranges +Original version License CC0 1.0 Universal (see below) + +Modified by Gonzalo Brito Gadeschi, NVIDIA corporation +Modifications under MIT license. + +--- + +SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: MIT + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +--- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. 
+These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tl { + namespace detail { + template + concept single_pass_iterator = std::input_or_output_iterator && !std::forward_iterator; + + template + constexpr auto common_iterator_category() { + if constexpr ((std::ranges::random_access_range && ...)) + return std::random_access_iterator_tag{}; + else if constexpr ((std::ranges::bidirectional_range && ...)) + return std::bidirectional_iterator_tag{}; + else if constexpr ((std::ranges::forward_range && ...)) + return std::forward_iterator_tag{}; + else if constexpr ((std::ranges::input_range && ...)) + return std::input_iterator_tag{}; + else + return std::output_iterator_tag{}; + } + } + + template + using common_iterator_category = decltype(detail::common_iterator_category()); + + template + concept simple_view = std::ranges::view && std::ranges::range && + std::same_as, std::ranges::iterator_t> && + std::same_as, + std::ranges::sentinel_t>; + + struct as_sentinel_t {}; + constexpr inline as_sentinel_t as_sentinel; + + template + using maybe_const = std::conditional_t; + + template + class basic_mixin : protected T { + public: + constexpr basic_mixin() + noexcept(std::is_nothrow_default_constructible::value) + requires std::default_initializable : + T() {} + constexpr basic_mixin(const T& t) + noexcept(std::is_nothrow_copy_constructible::value) + requires std::copy_constructible : + T(t) {} + constexpr basic_mixin(T&& t) + noexcept(std::is_nothrow_move_constructible::value) + requires std::move_constructible : + T(std::move(t)) {} + + + constexpr T& get() & noexcept { return *static_cast(this); } + constexpr const T& get() const& noexcept { return *static_cast(this); } + constexpr T&& get() && noexcept { return std::move(*static_cast(this)); } + constexpr const T&& get() const&& noexcept { return std::move(*static_cast(this)); } + }; + + namespace cursor { + namespace detail { + template + struct tags { + static constexpr auto single_pass() requires requires { { C::single_pass } -> std::convertible_to; } { + return C::single_pass; + } + static constexpr auto single_pass() { return false; } + + static constexpr auto contiguous() requires requires { { C::contiguous } -> std::convertible_to; } { + return C::contiguous; + } + static constexpr auto contiguous() { return false; } + }; + } + template + constexpr bool single_pass = detail::tags::single_pass(); + + template + constexpr bool tagged_contiguous = detail::tags::contiguous(); + + namespace detail { + template + struct deduced_mixin_t { + template static auto deduce(int)-> typename T::mixin; + template static auto deduce(...)->tl::basic_mixin; + using type = decltype(deduce(0)); + }; + } + + template + using mixin_t = typename detail::deduced_mixin_t::type; + + template + requires + requires(const C& c) { c.read(); } + using reference_t = decltype(std::declval().read()); + + namespace detail { + template + struct deduced_value_t { + template static auto deduce(int)-> typename T::value_type; + template static auto deduce(...)->std::decay_t>; + + using type = decltype(deduce(0)); + }; + } + + template + requires std::same_as::type, std::decay_t::type>> + using value_type_t = typename detail::deduced_value_t::type; + + namespace detail { + template + struct deduced_difference_t { + template static auto deduce(int)-> typename T::difference_type; + template + static auto deduce(long)->decltype(std::declval().distance_to(std::declval())); + template + static auto deduce(...)->std::ptrdiff_t; + + using 
type = decltype(deduce(0)); + }; + } + + template + using difference_type_t = typename detail::deduced_difference_t::type; + + template + concept cursor = std::semiregular> + && std::semiregular>> + && requires {typename difference_type_t; }; + + template + concept readable = cursor && requires(const C & c) { + c.read(); + typename reference_t; + typename value_type_t; + }; + + template + concept arrow = readable + && requires(const C & c) { c.arrow(); }; + + template + concept writable = cursor + && requires(C & c, T && t) { c.write(std::forward(t)); }; + + template + concept sentinel_for = cursor && std::semiregular + && requires(const C & c, const S & s) { {c.equal(s)} -> std::same_as; }; + + template + concept sized_sentinel_for = sentinel_for && + requires(const C & c, const S & s) { + {c.distance_to(s)} -> std::same_as>; + }; + + template + concept next = cursor && requires(C & c) { c.next(); }; + + template + concept prev = cursor && requires(C & c) { c.prev(); }; + + template + concept advance = cursor + && requires(C & c, difference_type_t n) { c.advance(n); }; + + template + concept indirect_move = readable + && requires(const C & c) { c.indirect_move(); }; + + template + concept indirect_swap = readable && readable + && requires(const C & c, const O & o) { + c.indirect_swap(o); + o.indirect_swap(c); + }; + + template + concept input = readable && next; + template + concept forward = input && sentinel_for && !single_pass; + template + concept bidirectional = forward && prev; + template + concept random_access = bidirectional && advance && sized_sentinel_for; + template + concept contiguous = random_access && tagged_contiguous && std::is_reference_v>; + + template + constexpr auto cpp20_iterator_category() { + if constexpr (contiguous) + return std::contiguous_iterator_tag{}; + else if constexpr (random_access) + return std::random_access_iterator_tag{}; + else if constexpr (bidirectional) + return std::bidirectional_iterator_tag{}; + else if constexpr (forward) + return std::forward_iterator_tag{}; + else + return std::input_iterator_tag{}; + } + template + using cpp20_iterator_category_t = decltype(cpp20_iterator_category()); + + //There were a few changes in requirements on iterators between C++17 and C++20 + //See https://wg21.link/p2259 for discussion + //- C++17 didn't have contiguous iterators + //- C++17 input iterators required *it++ to be valid + //- C++17 forward iterators required the reference type to be exactly value_type&/value_type const& (i.e. not a proxy) + struct not_a_cpp17_iterator {}; + + template + concept reference_is_value_type_ref = + (std::same_as, value_type_t&> || std::same_as, value_type_t const&>); + + template + concept can_create_postincrement_proxy = + (std::move_constructible> && std::constructible_from, reference_t>); + + template + constexpr auto cpp17_iterator_category() { + if constexpr (random_access +#if !defined(__NVCOMPILER) + // YOLO: with nvc++ proxy iterators can be random access . . . 
+ // BUG: Need to update Thrust to C++20 iterator categories + && reference_is_value_type_ref +#endif + ) + return std::random_access_iterator_tag{}; + else if constexpr (bidirectional && reference_is_value_type_ref) + return std::bidirectional_iterator_tag{}; + else if constexpr (forward && reference_is_value_type_ref) + return std::forward_iterator_tag{}; + else if constexpr (can_create_postincrement_proxy) + return std::input_iterator_tag{}; + else + return not_a_cpp17_iterator{}; + } + template + using cpp17_iterator_category_t = decltype(cpp17_iterator_category()); + + //iterator_concept and iterator_category are tricky; this abstracts them out. + //Since the rules for iterator categories changed between C++17 and C++20 + //a C++20 iterator may have a weaker category in C++17, + //or it might not be a valid C++17 iterator at all. + //iterator_concept will be the C++20 iterator category. + //iterator_category will be the C++17 iterator category, or it will not exist + //in the case that the iterator is not a valid C++17 iterator. + template > + struct associated_types_category_base { + using iterator_category = category; + }; + template + struct associated_types_category_base {}; + + template + struct associated_types : associated_types_category_base { + using iterator_concept = cpp20_iterator_category_t; + using value_type = cursor::value_type_t; + using difference_type = cursor::difference_type_t; + using reference = cursor::reference_t; + }; + + namespace detail { + // We assume a cursor is writeable if it's either not readable + // or it is writeable with the same type it reads to + template + struct is_writable_cursor { + template + requires requires (C c) { + c.write(c.read()); + } + static auto deduce()->std::true_type; + + template + static auto deduce()->std::false_type; + + template + static auto deduce()->std::true_type; + + static constexpr bool value = decltype(deduce())::value; + }; + } + } + + namespace detail { + template + struct post_increment_proxy { + private: + T cache_; + + public: + template + constexpr post_increment_proxy(U&& t) + : cache_(std::forward(t)) + {} + constexpr T const& operator*() const noexcept + { + return cache_; + } + }; + } + + + template + class basic_iterator : + public cursor::mixin_t + { + private: + using mixin = cursor::mixin_t; + + constexpr auto& cursor() noexcept { return this->mixin::get(); } + constexpr auto const& cursor() const noexcept { return this->mixin::get(); } + + template + friend class basic_iterator; + + //TODO these need to change to support output iterators + using reference_t = decltype(std::declval().read()); + using const_reference_t = reference_t; + + public: + using mixin::get; + + using value_type = cursor::value_type_t; + using difference_type = cursor::difference_type_t; + using reference = cursor::reference_t; + + basic_iterator() = default; + + using mixin::mixin; + + constexpr explicit basic_iterator(C&& c) + noexcept(std::is_nothrow_constructible_v) : + mixin(std::move(c)) {} + + + constexpr explicit basic_iterator(C const& c) + noexcept(std::is_nothrow_constructible_v) : + mixin(c) {} + + template O> + constexpr basic_iterator(basic_iterator&& that) + noexcept(std::is_nothrow_constructible::value) : + mixin(that.cursor()) {} + + template O> + constexpr basic_iterator(const basic_iterator& that) + noexcept(std::is_nothrow_constructible::value) : + mixin(std::move(that.cursor())) {} + + template O> + constexpr basic_iterator& operator=(basic_iterator&& that) & + noexcept(std::is_nothrow_assignable::value) { 
+ cursor() = std::move(that.cursor()); + return *this; + } + + template O> + constexpr basic_iterator& operator=(const basic_iterator& that) & + noexcept(std::is_nothrow_assignable::value) { + cursor() = that.cursor(); + return *this; + } + + template + requires + (!std::same_as, basic_iterator> && + !cursor::next&& + cursor::writable) + constexpr basic_iterator& operator=(T&& t) & + noexcept(noexcept(std::declval().write(static_cast(t)))) { + cursor() = std::forward(t); + return *this; + } + + friend constexpr decltype(auto) iter_move(const basic_iterator& i) +#if !defined(__NVCOMPILER) + noexcept(noexcept(i.cursor().indirect_move())) +#endif + requires cursor::indirect_move { + return i.cursor().indirect_move(); + } + + template + requires cursor::indirect_swap + friend constexpr void iter_swap( + const basic_iterator& x, const basic_iterator& y) +#if !defined(__NVCOMPILER) + noexcept(noexcept((void)x.indirect_swap(y))) +#endif + { + x.indirect_swap(y); + } + + //Input iterator + constexpr decltype(auto) operator*() const + noexcept(noexcept(std::declval().read())) + requires (cursor::readable && !cursor::detail::is_writable_cursor::value) { + return cursor().read(); + } + + //Output iterator + constexpr decltype(auto) operator*() + noexcept(noexcept(reference_t{ cursor() })) + requires (cursor::next&& cursor::detail::is_writable_cursor::value) { + return reference_t{ cursor() }; + } + + //Output iterator + constexpr decltype(auto) operator*() const + noexcept(noexcept( + const_reference_t{ cursor() })) + requires (cursor::next&& cursor::detail::is_writable_cursor::value) { + return const_reference_t{ cursor() }; + } + + constexpr basic_iterator& operator*() noexcept + requires (!cursor::next) { + return *this; + } + + // operator->: "Manual" deduction override, + constexpr decltype(auto) operator->() const + noexcept(noexcept(cursor().arrow())) + requires cursor::arrow { + return cursor().arrow(); + } + // operator->: Otherwise, if reference_t is an lvalue reference, + constexpr decltype(auto) operator->() const + noexcept(noexcept(*std::declval())) + requires (cursor::readable && !cursor::arrow) + && std::is_lvalue_reference::value{ + return std::addressof(**this); + } + + // modifiers + constexpr basic_iterator& operator++() & noexcept { + return *this; + } + constexpr basic_iterator& operator++() & + noexcept(noexcept(std::declval().cursor().next())) + requires cursor::next { + cursor().next(); + return *this; + } + + //C++17 required that *it++ was valid. + //For input iterators, we can't copy *this, so we need to create a proxy reference. 
+ constexpr auto operator++(int) & + noexcept(noexcept(++std::declval()) && + std::is_nothrow_move_constructible_v&& + std::is_nothrow_constructible_v) + requires (cursor::single_pass&& + std::move_constructible&& + std::constructible_from) { + detail::post_increment_proxy p(**this); + ++* this; + return p; + } + + //If we can't create a proxy reference, it++ is going to return void + constexpr void operator++(int) & + noexcept(noexcept(++std::declval())) + requires (cursor::single_pass && !(std::move_constructible&& + std::constructible_from)) { + (void)(++(*this)); + } + + //If C is a forward cursor then copying it is fine + constexpr basic_iterator operator++(int) & + noexcept(std::is_nothrow_copy_constructible_v&& + std::is_nothrow_move_constructible_v && + noexcept(++std::declval())) + requires (!cursor::single_pass) { + auto temp = *this; + ++* this; + return temp; + } + + constexpr basic_iterator& operator--() & + noexcept(noexcept(cursor().prev())) + requires cursor::bidirectional { + cursor().prev(); + return *this; + } + + //Postfix decrement doesn't have the same issue as postfix increment + //because bidirectional requires the cursor to be a forward cursor anyway + //so copying it is fine. + constexpr basic_iterator operator--(int) & + noexcept(std::is_nothrow_copy_constructible::value&& + std::is_nothrow_move_constructible::value && + noexcept(--std::declval())) + requires cursor::bidirectional { + auto tmp = *this; + --* this; + return tmp; + } + + constexpr basic_iterator& operator+=(difference_type n) & + noexcept(noexcept(cursor().advance(n))) + requires cursor::random_access { + cursor().advance(n); + return *this; + } + + constexpr basic_iterator& operator-=(difference_type n) & + noexcept(noexcept(cursor().advance(-n))) + requires cursor::random_access { + cursor().advance(-n); + return *this; + } + + constexpr decltype(auto) operator[](difference_type n) const + noexcept(noexcept(*(std::declval() + n))) + requires cursor::random_access { + return *(*this + n); + } + + // non-template type-symmetric ops to enable implicit conversions + friend constexpr difference_type operator-( + const basic_iterator& x, const basic_iterator& y) + noexcept(noexcept(y.cursor().distance_to(x.cursor()))) + requires cursor::sized_sentinel_for { + return y.cursor().distance_to(x.cursor()); + } + friend constexpr bool operator==( + const basic_iterator& x, const basic_iterator& y) +#if !defined(__NVCOMPILER) + noexcept(noexcept(x.cursor().equal(y.cursor()))) + requires cursor::sentinel_for +#endif + { + return x.cursor().equal(y.cursor()); + } + friend constexpr bool operator!=( + const basic_iterator& x, const basic_iterator& y) +#if !defined(__NVCOMPILER) + noexcept(noexcept(!(x == y))) + requires cursor::sentinel_for +#endif + { + return !(x == y); + } + friend constexpr bool operator<( + const basic_iterator& x, const basic_iterator& y) +#if !defined(__NVCOMPILER) + noexcept(noexcept(y - x)) +#endif + requires cursor::sized_sentinel_for { + return 0 < (y - x); + } + friend constexpr bool operator>( + const basic_iterator& x, const basic_iterator& y) +#if !defined(__NVCOMPILER) + noexcept(noexcept(y - x)) +#endif + requires cursor::sized_sentinel_for { + return 0 > (y - x); + } + friend constexpr bool operator<=( + const basic_iterator& x, const basic_iterator& y) +#if !defined(__NVCOMPILER) + noexcept(noexcept(y - x)) +#endif + requires cursor::sized_sentinel_for { + return 0 <= (y - x); + } + friend constexpr bool operator>=( + const basic_iterator& x, const basic_iterator& y) +#if 
!defined(__NVCOMPILER) + noexcept(noexcept(y - x)) +#endif + requires cursor::sized_sentinel_for { + return 0 >= (y - x); + } + }; + + namespace detail { + template + struct is_basic_iterator { + template + static auto deduce(basic_iterator const&)->std::true_type; + template + static auto deduce(...)->std::false_type; + static constexpr inline bool value = decltype(deduce(std::declval()))::value; + }; + } + + // basic_iterator nonmember functions + template + constexpr basic_iterator operator+( + const basic_iterator& i, cursor::difference_type_t n) + noexcept(std::is_nothrow_copy_constructible>::value&& + std::is_nothrow_move_constructible>::value && + noexcept(std::declval&>() += n)) + requires cursor::random_access { + auto tmp = i; + tmp += n; + return tmp; + } + template + constexpr basic_iterator operator+( + cursor::difference_type_t n, const basic_iterator& i) + noexcept(noexcept(i + n)) + requires cursor::random_access { + return i + n; + } + + template + constexpr basic_iterator operator-( + const basic_iterator& i, cursor::difference_type_t n) + noexcept(noexcept(i + (-n))) + requires cursor::random_access { + return i + (-n); + } + template + requires cursor::sized_sentinel_for + constexpr cursor::difference_type_t operator-( + const basic_iterator& lhs, const basic_iterator& rhs) + noexcept(noexcept( + rhs.get().distance_to(lhs.get()))) { + return rhs.get().distance_to(lhs.get()); + } + template + requires cursor::sized_sentinel_for + constexpr cursor::difference_type_t operator-( + const S& lhs, const basic_iterator& rhs) + noexcept(noexcept(rhs.get().distance_to(lhs))) { + return rhs.get().distance_to(lhs); + } + template + requires cursor::sized_sentinel_for + constexpr cursor::difference_type_t operator-( + const basic_iterator& lhs, const S& rhs) + noexcept(noexcept(-(rhs - lhs))) { + return -(rhs - lhs); + } + + template + requires cursor::sentinel_for + constexpr bool operator==( + const basic_iterator& lhs, const basic_iterator& rhs) + noexcept(noexcept(lhs.get().equal(rhs.get()))) { + return lhs.get().equal(rhs.get()); + } + template + requires cursor::sentinel_for + constexpr bool operator==( + const basic_iterator& lhs, const S& rhs) + noexcept(noexcept(lhs.get().equal(rhs))) { + return lhs.get().equal(rhs); + } + template + requires cursor::sentinel_for + constexpr bool operator==( + const S& lhs, const basic_iterator& rhs) + noexcept(noexcept(rhs == lhs)) { + return rhs == lhs; + } + + template + requires cursor::sentinel_for + constexpr bool operator!=( + const basic_iterator& lhs, const basic_iterator& rhs) + noexcept(noexcept(!(lhs == rhs))) { + return !(lhs == rhs); + } + template + requires cursor::sentinel_for + constexpr bool operator!=( + const basic_iterator& lhs, const S& rhs) + noexcept(noexcept(!lhs.get().equal(rhs))) { + return !lhs.get().equal(rhs); + } + template + requires cursor::sentinel_for + constexpr bool operator!=( + const S& lhs, const basic_iterator& rhs) + noexcept(noexcept(!rhs.get().equal(lhs))) { + return !rhs.get().equal(lhs); + } + + template + requires cursor::sized_sentinel_for + constexpr bool operator<( + const basic_iterator& lhs, const basic_iterator& rhs) + noexcept(noexcept(lhs - rhs < 0)) { + return (lhs - rhs) < 0; + } + + template + requires cursor::sized_sentinel_for + constexpr bool operator>( + const basic_iterator& lhs, const basic_iterator& rhs) + noexcept(noexcept((lhs - rhs) > 0)) { + return (lhs - rhs) > 0; + } + + template + requires cursor::sized_sentinel_for + constexpr bool operator<=( + const basic_iterator& 
lhs, const basic_iterator& rhs) + noexcept(noexcept((lhs - rhs) <= 0)) { + return (lhs - rhs) <= 0; + } + + template + requires cursor::sized_sentinel_for + constexpr bool operator>=( + const basic_iterator& lhs, const basic_iterator& rhs) + noexcept(noexcept((lhs - rhs) >= 0)) { + return (lhs - rhs) >= 0; + } + + template + class basic_sentinel { + using Base = std::conditional_t; + + public: + std::ranges::sentinel_t end_{}; + basic_sentinel() = default; + constexpr explicit basic_sentinel(std::ranges::sentinel_t end) + : end_{ std::move(end) } {} + + constexpr basic_sentinel(basic_sentinel other) requires Const&& std:: + convertible_to, + std::ranges::sentinel_t> + : end_{ std::move(other.end_) } {} + + constexpr auto end() const { + return end_; + } + + friend class basic_sentinel; + }; + + //tl::compose composes f and g such that compose(f,g)(args...) is f(g(args...)), i.e. g is called first + template + struct compose_fn { + [[no_unique_address]] F f; + [[no_unique_address]] G g; + + template + compose_fn(A&& a, B&& b) : f(std::forward(a)), g(std::forward(b)) {} + + template + static constexpr auto call(A&& a, B&& b, Args&&... args) { + if constexpr (std::is_void_v>) { + std::invoke(std::forward(b), std::forward(args)...); + return std::invoke(std::forward(a)); + } + else { + return std::invoke(std::forward(a), std::invoke(std::forward(b), std::forward(args)...)); + } + } + + template + constexpr auto operator()(Args&&... args) & { + return call(f, g, std::forward(args)...); + } + + template + constexpr auto operator()(Args&&... args) const& { + return call(f, g, std::forward(args)...); + } + + template + constexpr auto operator()(Args&&... args)&& { + return call(std::move(f), std::move(g), std::forward(args)...); + } + + template + constexpr auto operator()(Args&&... args) const&& { + return call(std::move(f), std::move(g), std::forward(args)...); + } + }; + + template + constexpr auto compose(F&& f, G&& g) { + return compose_fn, std::remove_cvref_t>(std::forward(f), std::forward(g)); + } + + //tl::pipeable takes some invocable and enables: + //- Piping a single argument to it such that a | pipeable is the same as pipeable(a) + //- Piping it to another pipeable object, such that a | b is the same as tl::compose(b, a) + struct pipeable_base {}; + template + concept is_pipeable = std::is_base_of_v>; + + template + struct pipeable_fn : pipeable_base { + [[no_unique_address]] F f_; + + constexpr pipeable_fn(F f) : f_(std::move(f)) {} + + template + constexpr auto operator()(Args&&... args) const requires std::invocable { + return std::invoke(f_, std::forward(args)...); + } + }; + + template + constexpr auto pipeable(F f) { + return pipeable_fn{ std::move(f) }; + } + + template + constexpr auto operator|(V&& v, Pipe&& fn) + requires (!is_pipeable && is_pipeable && std::invocable) { + return std::invoke(std::forward(fn).f_, std::forward(v)); + } + + template + constexpr auto operator|(Pipe1&& p1, Pipe2&& p2) + requires (is_pipeable&& is_pipeable) { + return pipeable(compose(std::forward(p2).f_, std::forward(p1).f_)); + } + + //tl::bind_back binds the last N arguments of f to the given ones, returning a new closure + template + constexpr auto bind_back(F&& f, Args&&... args) { + return[f_ = std::forward(f), ...args_ = std::forward(args)] + (auto&&... 
other_args) + requires std::invocable { + return std::invoke(f_, std::forward(other_args)..., args_...); + }; + } +} + +namespace std { + template + struct iterator_traits> : tl::cursor::associated_types {}; +} + +namespace tl { + + template + constexpr inline std::size_t tuple_size = std::tuple_size_v>; + + template + using index_constant = std::integral_constant; + + namespace meta { + //Partially-apply the given template with the given arguments + template