From c2190e6cfbe8ddeda2091c47908415937f774a98 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 3 Mar 2025 16:20:22 -0700
Subject: [PATCH 01/37] Initial check-in of python/miniWeather.py

This is NOT COMPLETE YET.

It's a port of cpp/miniWeather_serial.cpp.
---
 python/miniWeather.py | 869 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 869 insertions(+)
 create mode 100644 python/miniWeather.py
diff --git a/python/miniWeather.py b/python/miniWeather.py
new file mode 100644
index 0000000..6bb16a3
--- /dev/null
+++ b/python/miniWeather.py
@@ -0,0 +1,869 @@
+
+# //////////////////////////////////////////////////////////////////////////////////////////
+# // miniWeather
+# // Author: Matt Norman <normanmr@ornl.gov>  , Oak Ridge National Laboratory
+# // This code simulates dry, stratified, compressible, non-hydrostatic fluid flows
+# // For documentation, please see the attached documentation in the "documentation" folder
+# //
+# //////////////////////////////////////////////////////////////////////////////////////////
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <mpi.h>
+#include <ctime>
+#include "const.h"
+#include "pnetcdf.h"
+#include <chrono>
+
+# "Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction
+hs: int = 2
+
+# real can be either float or double, depending on cpp/const.h.
+
+#typedef yakl::Array<real  ,1,yakl::memHost> real1d;
+def real1d(name: string, nx: int):
+    # FIXME (mfh 2025/03/03) This should NOT return the same thing as doub1d; need to change type
+    return np.zeros(nx) # FIXME (mfh 2025/03/03)
+
+#typedef yakl::Array<real  ,2,yakl::memHost> real2d;
+def real2d(name: string, nx: int, nz: int):
+    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+
+#typedef yakl::Array<real  ,3,yakl::memHost> real3d;
+def real3d(name: string, nx: int, nz: int, nvars: int):
+    return np.zeros((nvars, nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+
+#typedef yakl::Array<double,2,yakl::memHost> doub2d;
+def doub2d(name: string, nx: int, nz: int):
+    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+
+typedef yakl::Array<real   const,1,yakl::memHost> realConst1d;
+typedef yakl::Array<real   const,2,yakl::memHost> realConst2d;
+typedef yakl::Array<real   const,3,yakl::memHost> realConst3d;
+typedef yakl::Array<double const,1,yakl::memHost> doubConst1d;
+typedef yakl::Array<double const,2,yakl::memHost> doubConst2d;
+typedef yakl::Array<double const,3,yakl::memHost> doubConst3d;
+
+///////////////////////////////////////////////////////////////////////////////////////
+// Variables that are initialized but remain static over the course of the simulation
+///////////////////////////////////////////////////////////////////////////////////////
+struct Fixed_data {
+  int nx, nz;                 //Number of local grid cells in the x- and z- dimensions for this MPI task
+  int i_beg, k_beg;           //beginning index in the x- and z-directions for this MPI task
+  int nranks, myrank;         //Number of MPI ranks and my rank id
+  int left_rank, right_rank;  //MPI Rank IDs that exist to my left and right in the global domain
+  int mainproc;             //Am I the main process (rank == 0)?
+  realConst1d hy_dens_cell;        //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+  realConst1d hy_dens_theta_cell;  //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+  realConst1d hy_dens_int;         //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+  realConst1d hy_dens_theta_int;   //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+  realConst1d hy_pressure_int;     //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+};
+
+///////////////////////////////////////////////////////////////////////////////////////
+// THE MAIN PROGRAM STARTS HERE
+///////////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+  #MPI_Init(&argc,&argv);
+  #yakl::init();
+  {
+    Fixed_data fixed_data;
+    real3d state;
+    real dt;                    //Model time step (seconds)
+
+    # init allocates state
+    (fixed_data, state, dt) = init() # init( state , dt , fixed_data );
+
+    auto &mainproc = fixed_data.mainproc;
+
+    //Initial reductions for mass, kinetic energy, and total energy
+    double mass0, te0;
+    reductions(state,mass0,te0,fixed_data);
+
+    int  num_out = 0;          //The number of outputs performed so far
+    real output_counter = 0;   //Helps determine when it's time to do output
+    real etime = 0;
+
+    //Output the initial state
+    if (output_freq >= 0) {
+      output(state,etime,num_out,fixed_data);
+    }
+
+    int direction_switch = 1;  // Tells dimensionally split which order to take x,z solves
+
+    ////////////////////////////////////////////////////
+    // MAIN TIME STEP LOOP
+    ////////////////////////////////////////////////////
+    auto t1 = std::chrono::steady_clock::now();
+    while (etime < sim_time) {
+      //If the time step leads to exceeding the simulation time, shorten it for the last step
+      if (etime + dt > sim_time) { dt = sim_time - etime; }
+      # Perform a single time step
+      direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
+      //Inform the user
+      #ifndef NO_INFORM
+        if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); }
+      #endif
+      //Update the elapsed time and output counter
+      etime = etime + dt;
+      output_counter = output_counter + dt;
+      //If it's time for output, reset the counter, and do output
+      if (output_freq >= 0 && output_counter >= output_freq) {
+        output_counter = output_counter - output_freq;
+        output(state,etime,num_out,fixed_data);
+      }
+    }
+    auto t2 = std::chrono::steady_clock::now();
+    if (mainproc) {
+      std::cout << "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n";
+    }
+
+    //Final reductions for mass, kinetic energy, and total energy
+    double mass, te;
+    reductions(state,mass,te,fixed_data);
+
+    if (mainproc) {
+      printf( "d_mass: %le\n" , (mass - mass0)/mass0 );
+      printf( "d_te:   %le\n" , (te   - te0  )/te0   );
+    }
+
+    finalize();
+  }
+  yakl::finalize();
+  MPI_Finalize();
+}
+
+
+# Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
+# The dimensional splitting is a second-order-accurate alternating Strang splitting in which the
+# order of directions is alternated each time step.
+# The Runge-Kutta method used here is defined as follows:
+#   q*     = q_n + dt/3 * rhs(q_n)
+#   q**    = q_n + dt/2 * rhs(q* )
+#   q_n+1  = q_n + dt/1 * rhs(q**)
+def perform_timestep(
+    state, # real3d const&, input parameter
+    dt, # real, must be an input parameter
+    direction_switch, # int&, in/out parameter, transformed to input and return value
+    fixed_data # Fixed_data const &, input parameter
+  ) -> int: # was void; now returns direction_switch
+
+  nx = fixed_data.nx
+  nz = fixed_data.nz
+
+  state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs);
+
+  if direction_switch != 0:
+    # x-direction first
+    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , fixed_data )
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data )
+    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , fixed_data )
+    # z-direction second
+    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , fixed_data )
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data )
+    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , fixed_data )
+  else:
+    # z-direction second
+    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , fixed_data )
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data )
+    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , fixed_data )
+    # x-direction first
+    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , fixed_data )
+    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data )
+    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , fixed_data )
+
+  if direction_switch:
+    direction_switch = 0
+  else:
+    direction_switch = 1
+
+  return direction_switch
+
+
+
+//Perform a single semi-discretized step in time with the form:
+//state_out = state_init + dt * rhs(state_forcing)
+//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
+void semi_discrete_step( realConst3d state_init , real3d const &state_forcing , real3d const &state_out , real dt , int dir , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &i_beg              = fixed_data.i_beg             ;
+  auto &k_beg              = fixed_data.k_beg             ;
+  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
+
+  tend = real3d("tend", NUM_VARS, nz, nx);
+
+  if        (dir == DIR_X) {
+    //Set the halo values for this MPI task's fluid state in the x-direction
+    yakl::timer_start("halo x");
+    set_halo_values_x(state_forcing,fixed_data);
+    yakl::timer_stop("halo x");
+    //Compute the time tendencies for the fluid state in the x-direction
+    yakl::timer_start("tendencies x");
+    compute_tendencies_x(state_forcing,tend,dt,fixed_data);
+    yakl::timer_stop("tendencies x");
+  } else if (dir == DIR_Z) {
+    //Set the halo values for this MPI task's fluid state in the z-direction
+    yakl::timer_start("halo z");
+    set_halo_values_z(state_forcing,fixed_data);
+    yakl::timer_stop("halo z");
+    //Compute the time tendencies for the fluid state in the z-direction
+    yakl::timer_start("tendencies z");
+    compute_tendencies_z(state_forcing,tend,dt,fixed_data);
+    yakl::timer_stop("tendencies z");
+  }
+
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  //Apply the tendencies to the fluid state
+  yakl::timer_start("apply tendencies");
+  for (int ll=0; ll<NUM_VARS; ll++) {
+    for (int k=0; k<nz; k++) {
+      for (int i=0; i<nx; i++) {
+        if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) {
+          real x = (i_beg + i+0.5)*dx;
+          real z = (k_beg + k+0.5)*dz;
+          real wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. );
+          tend(ID_WMOM,k,i) += wpert*hy_dens_cell(hs+k);
+        }
+        state_out(ll,hs+k,hs+i) = state_init(ll,hs+k,hs+i) + dt * tend(ll,k,i);
+      }
+    }
+  }
+  yakl::timer_stop("apply tendencies");
+}
+
+
+//Compute the time tendencies of the fluid state using forcing in the x-direction
+//Since the halos are set in a separate routine, this will not require MPI
+//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+void compute_tendencies_x( realConst3d state , real3d const &tend , real dt , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
+  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
+
+  flux = real3d("flux", NUM_VARS, nz, nx+1);
+
+  //Compute the hyperviscosity coefficient
+  real hv_coef = -hv_beta * dx / (16*dt);
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  //Compute fluxes in the x-direction for each cell
+  for (int k=0; k<nz; k++) {
+    for (int i=0; i<nx+1; i++) {
+      SArray<real,1,4> stencil;
+      SArray<real,1,NUM_VARS> d3_vals;
+      SArray<real,1,NUM_VARS> vals;
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (int ll=0; ll<NUM_VARS; ll++) {
+        for (int s=0; s < sten_size; s++) {
+          stencil(s) = state(ll,hs+k,i+s);
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals(ll) = -stencil(0)/12 + 7*stencil(1)/12 + 7*stencil(2)/12 - stencil(3)/12;
+        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals(ll) = -stencil(0) + 3*stencil(1) - 3*stencil(2) + stencil(3);
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      real r = vals(ID_DENS) + hy_dens_cell(hs+k);
+      real u = vals(ID_UMOM) / r;
+      real w = vals(ID_WMOM) / r;
+      real t = ( vals(ID_RHOT) + hy_dens_theta_cell(hs+k) ) / r;
+      real p = C0*pow((r*t),gamm);
+
+      //Compute the flux vector
+      flux(ID_DENS,k,i) = r*u     - hv_coef*d3_vals(ID_DENS);
+      flux(ID_UMOM,k,i) = r*u*u+p - hv_coef*d3_vals(ID_UMOM);
+      flux(ID_WMOM,k,i) = r*u*w   - hv_coef*d3_vals(ID_WMOM);
+      flux(ID_RHOT,k,i) = r*u*t   - hv_coef*d3_vals(ID_RHOT);
+    }
+  }
+
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  //Use the fluxes to compute tendencies for each cell
+  for (int ll=0; ll<NUM_VARS; ll++) {
+    for (int k=0; k<nz; k++) {
+      for (int i=0; i<nx; i++) {
+        tend(ll,k,i) = -( flux(ll,k,i+1) - flux(ll,k,i) ) / dx;
+      }
+    }
+  }
+}
+
+
+//Compute the time tendencies of the fluid state using forcing in the z-direction
+//Since the halos are set in a separate routine, this will not require MPI
+//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+//Then, compute the tendencies using those fluxes
+void compute_tendencies_z( realConst3d state , real3d const &tend , real dt , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &hy_dens_int        = fixed_data.hy_dens_int       ;
+  auto &hy_dens_theta_int  = fixed_data.hy_dens_theta_int ;
+  auto &hy_pressure_int    = fixed_data.hy_pressure_int   ;
+
+  flux = real3d("flux", NUM_VARS, nz+1, nx);
+
+  //Compute the hyperviscosity coefficient
+  real hv_coef = -hv_beta * dz / (16*dt);
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  //Compute fluxes in the x-direction for each cell
+  for (int k=0; k<nz+1; k++) {
+    for (int i=0; i<nx; i++) {
+      SArray<real,1,4> stencil;
+      SArray<real,1,NUM_VARS> d3_vals;
+      SArray<real,1,NUM_VARS> vals;
+      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for (int ll=0; ll<NUM_VARS; ll++) {
+        for (int s=0; s<sten_size; s++) {
+          stencil(s) = state(ll,k+s,hs+i);
+        }
+        //Fourth-order-accurate interpolation of the state
+        vals(ll) = -stencil(0)/12 + 7*stencil(1)/12 + 7*stencil(2)/12 - stencil(3)/12;
+        //First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals(ll) = -stencil(0) + 3*stencil(1) - 3*stencil(2) + stencil(3);
+      }
+
+      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      real r = vals(ID_DENS) + hy_dens_int(k);
+      real u = vals(ID_UMOM) / r;
+      real w = vals(ID_WMOM) / r;
+      real t = ( vals(ID_RHOT) + hy_dens_theta_int(k) ) / r;
+      real p = C0*pow((r*t),gamm) - hy_pressure_int(k);
+      if (k == 0 || k == nz) {
+        w                = 0;
+        d3_vals(ID_DENS) = 0;
+      }
+
+      //Compute the flux vector with hyperviscosity
+      flux(ID_DENS,k,i) = r*w     - hv_coef*d3_vals(ID_DENS);
+      flux(ID_UMOM,k,i) = r*w*u   - hv_coef*d3_vals(ID_UMOM);
+      flux(ID_WMOM,k,i) = r*w*w+p - hv_coef*d3_vals(ID_WMOM);
+      flux(ID_RHOT,k,i) = r*w*t   - hv_coef*d3_vals(ID_RHOT);
+    }
+  }
+
+  //Use the fluxes to compute tendencies for each cell
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  for (int ll=0; ll<NUM_VARS; ll++) {
+    for (int k=0; k<nz; k++) {
+      for (int i=0; i<nx; i++) {
+        tend(ll,k,i) = -( flux(ll,k+1,i) - flux(ll,k,i) ) / dz;
+        if (ll == ID_WMOM) {
+          tend(ll,k,i) -= state(ID_DENS,hs+k,hs+i)*grav;
+        }
+      }
+    }
+  }
+}
+
+
+
+//Set this MPI task's halo values in the x-direction. This routine will require MPI
+void set_halo_values_x( real3d const &state , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &k_beg              = fixed_data.k_beg             ;
+  auto &left_rank          = fixed_data.left_rank         ;
+  auto &right_rank         = fixed_data.right_rank        ;
+  auto &myrank             = fixed_data.myrank            ;
+  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
+  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
+
+  ////////////////////////////////////////////////////////////////////////
+  // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS
+  // (1) give    state(1:hs,1:nz,1:NUM_VARS)       to   my left  neighbor
+  // (2) receive state(1-hs:0,1:nz,1:NUM_VARS)     from my left  neighbor
+  // (3) give    state(nx-hs+1:nx,1:nz,1:NUM_VARS) to   my right neighbor
+  // (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor
+  ////////////////////////////////////////////////////////////////////////
+
+  //////////////////////////////////////////////////////
+  // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI
+  //////////////////////////////////////////////////////
+  for (int ll=0; ll<NUM_VARS; ll++) {
+    for (int k=0; k<nz; k++) {
+      state(ll,hs+k,0      ) = state(ll,hs+k,nx+hs-2);
+      state(ll,hs+k,1      ) = state(ll,hs+k,nx+hs-1);
+      state(ll,hs+k,nx+hs  ) = state(ll,hs+k,hs     );
+      state(ll,hs+k,nx+hs+1) = state(ll,hs+k,hs+1   );
+    }
+  }
+  ////////////////////////////////////////////////////
+
+  if (data_spec_int == DATA_SPEC_INJECTION) {
+    if (myrank == 0) {
+      /////////////////////////////////////////////////
+      // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+      /////////////////////////////////////////////////
+      for (int k=0; k<nz; k++) {
+        for (int i=0; i<hs; i++) {
+          real z = (k_beg + k+0.5)*dz;
+          if (abs(z-3*zlen/4) <= zlen/16) {
+            state(ID_UMOM,hs+k,i) = (state(ID_DENS,hs+k,i)+hy_dens_cell(hs+k)) * 50.;
+            state(ID_RHOT,hs+k,i) = (state(ID_DENS,hs+k,i)+hy_dens_cell(hs+k)) * 298. - hy_dens_theta_cell(hs+k);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+//Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI
+//decomposition in the vertical direction
+void set_halo_values_z( real3d const &state , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
+  
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  for (int ll=0; ll<NUM_VARS; ll++) {
+    for (int i=0; i<nx+2*hs; i++) {
+      if (ll == ID_WMOM) {
+        state(ll,0      ,i) = 0.;
+        state(ll,1      ,i) = 0.;
+        state(ll,nz+hs  ,i) = 0.;
+        state(ll,nz+hs+1,i) = 0.;
+      } else if (ll == ID_UMOM) {
+        state(ll,0      ,i) = state(ll,hs     ,i) / hy_dens_cell(hs     ) * hy_dens_cell(0      );
+        state(ll,1      ,i) = state(ll,hs     ,i) / hy_dens_cell(hs     ) * hy_dens_cell(1      );
+        state(ll,nz+hs  ,i) = state(ll,nz+hs-1,i) / hy_dens_cell(nz+hs-1) * hy_dens_cell(nz+hs  );
+        state(ll,nz+hs+1,i) = state(ll,nz+hs-1,i) / hy_dens_cell(nz+hs-1) * hy_dens_cell(nz+hs+1);
+      } else {
+        state(ll,0      ,i) = state(ll,hs     ,i);
+        state(ll,1      ,i) = state(ll,hs     ,i);
+        state(ll,nz+hs  ,i) = state(ll,nz+hs-1,i);
+        state(ll,nz+hs+1,i) = state(ll,nz+hs-1,i);
+      }
+    }
+  }
+}
+
+# state, dt, and fixed_data used to be output parameters.
+# It would be more Pythonic to return them as a tuple.
+def init(): # tuple[real3d, real, Fixed_data]:
+  #real3d state
+  #Fixed_data fixed_data
+  #real dt
+
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &i_beg              = fixed_data.i_beg             ;
+  auto &k_beg              = fixed_data.k_beg             ;
+  auto &left_rank          = fixed_data.left_rank         ;
+  auto &right_rank         = fixed_data.right_rank        ;
+  auto &nranks             = fixed_data.nranks            ;
+  auto &myrank             = fixed_data.myrank            ;
+  auto &mainproc         = fixed_data.mainproc        ;
+  ierr: int = 0
+
+  # /////////////////////////////////////////////////////////////
+  #// BEGIN MPI DUMMY SECTION
+  # // TODO: (1) GET NUMBER OF MPI RANKS
+  # //       (2) GET MY MPI RANK ID (RANKS ARE ZERO-BASED INDEX)
+  # //       (3) COMPUTE MY BEGINNING "I" INDEX (1-based index)
+  # //       (4) COMPUTE HOW MANY X-DIRECTION CELLS MY RANK HAS
+  # //       (5) FIND MY LEFT AND RIGHT NEIGHBORING RANK IDs
+  # /////////////////////////////////////////////////////////////
+  nranks = 1
+  myrank = 0
+  i_beg = 0
+  nx = nx_glob
+  left_rank = 0
+  right_rank = 0
+  # //////////////////////////////////////////////
+  # END MPI DUMMY SECTION
+  # //////////////////////////////////////////////
+
+  # Vertical direction isn't MPI-ized, so the rank's local values = the global values
+  k_beg = 0
+  nz = nz_glob
+  mainproc = (myrank == 0)
+
+  # Allocate the model data
+  state = real3d("state", NUM_VARS, nz+2*hs, nx+2*hs)
+
+  # Define the maximum stable time step based on an assumed maximum wind speed
+  dt: real = min(dx,dz) / max_speed * cfl;
+
+  # If I'm the main process in MPI, display some grid information
+  if mainproc:
+    print( "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob)
+    print( "dx,dz: %lf %lf\n",dx,dz)
+    print( "dt: %lf\n",dt)
+
+  # Want to make sure this info is displayed before further output
+  # ierr = MPI_Barrier(MPI_COMM_WORLD);
+
+  # Define quadrature weights and points
+  nqpoints: int = 3 
+  # SArray<real,1,nqpoints> qpoints;
+  # SArray<real,1,nqpoints> qweights;
+
+  qpoints = np.array([0.112701665379258311482073460022, 0.500000000000000000000000000000, 0.887298334620741688517926539980], dtype=real)
+  qweights = np.array([0.277777777777777777777777777779, 0.444444444444444444444444444444, 0.277777777777777777777777777779], dtype=real)
+
+  # //////////////////////////////////////////////////////////////////////////
+  # Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
+  #//////////////////////////////////////////////////////////////////////////
+  #///////////////////////////////////////////////
+  # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  #///////////////////////////////////////////////
+  for k in range(nz+2*hs):
+    for i in range(nx+2*hs):
+      # Initialize the state to zero
+      for ll in range(NUM_VARS):
+        state[ll,k,i] = 0.0
+      
+      # Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
+      for kk in range(nqpoints):
+        for ii in range(nqpoints):
+          # Compute the x,z location within the global domain based on cell and quadrature index
+          x: real = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx
+          z: real = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz
+          # real r, u, w, t, hr, ht;
+
+          # The above real variables are probably output parameters
+          # of collision, thermal, gravity_waves, density_current, and injection.
+          # x and z are probably input parameters of these functions.
+
+          # Set the fluid state based on the user's specification
+          if data_spec_int == DATA_SPEC_COLLISION:
+            (r,u,w,t,hr,ht) = collision(x,z)
+          if data_spec_int == DATA_SPEC_THERMAL:
+            (r,u,w,t,hr,ht) = thermal(x,z)
+          if data_spec_int == DATA_SPEC_GRAVITY_WAVES:
+            (r,u,w,t,hr,ht) = gravity_waves(x,z)
+          if data_spec_int == DATA_SPEC_DENSITY_CURRENT:
+            (r,u,w,t,hr,ht) = density_current(x,z)
+          if data_spec_int == DATA_SPEC_INJECTION:
+            (r,u,w,t,hr,ht) = injection(x,z)
+
+          # Store into the fluid state array
+          state[ID_DENS,k,i] += r                         * qweights[ii]*qweights[kk];
+          state[ID_UMOM,k,i] += (r+hr)*u                  * qweights[ii]*qweights[kk];
+          state[ID_WMOM,k,i] += (r+hr)*w                  * qweights[ii]*qweights[kk];
+          state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk];
+
+  hy_dens_cell       = real1d("hy_dens_cell      ", nz+2*hs);
+  hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs);
+  hy_dens_int        = real1d("hy_dens_int       ", nz+1);
+  hy_dens_theta_int  = real1d("hy_dens_theta_int ", nz+1);
+  hy_pressure_int    = real1d("hy_pressure_int   ", nz+1);
+
+  //Compute the hydrostatic background state over vertical cell averages
+  /////////////////////////////////////////////////
+  // TODO: MAKE THIS LOOP A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  for (int k=0; k<nz+2*hs; k++) {
+    hy_dens_cell      (k) = 0.;
+    hy_dens_theta_cell(k) = 0.;
+    for (int kk=0; kk<nqpoints; kk++) {
+      real z = (k_beg + k-hs+0.5)*dz;
+      real r, u, w, t, hr, ht;
+      //Set the fluid state based on the user's specification
+      if (data_spec_int == DATA_SPEC_COLLISION      ) { collision      (0.,z,r,u,w,t,hr,ht); }
+      if (data_spec_int == DATA_SPEC_THERMAL        ) { thermal        (0.,z,r,u,w,t,hr,ht); }
+      if (data_spec_int == DATA_SPEC_GRAVITY_WAVES  ) { gravity_waves  (0.,z,r,u,w,t,hr,ht); }
+      if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(0.,z,r,u,w,t,hr,ht); }
+      if (data_spec_int == DATA_SPEC_INJECTION      ) { injection      (0.,z,r,u,w,t,hr,ht); }
+      hy_dens_cell      (k) = hy_dens_cell      (k) + hr    * qweights(kk);
+      hy_dens_theta_cell(k) = hy_dens_theta_cell(k) + hr*ht * qweights(kk);
+    }
+  }
+
+  //Compute the hydrostatic background state at vertical cell interfaces
+  /////////////////////////////////////////////////
+  // TODO: MAKE THIS LOOP A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  for (int k=0; k<nz+1; k++) {
+    real z = (k_beg + k)*dz;
+    real r, u, w, t, hr, ht;
+    if (data_spec_int == DATA_SPEC_COLLISION      ) { collision      (0.,z,r,u,w,t,hr,ht); }
+    if (data_spec_int == DATA_SPEC_THERMAL        ) { thermal        (0.,z,r,u,w,t,hr,ht); }
+    if (data_spec_int == DATA_SPEC_GRAVITY_WAVES  ) { gravity_waves  (0.,z,r,u,w,t,hr,ht); }
+    if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(0.,z,r,u,w,t,hr,ht); }
+    if (data_spec_int == DATA_SPEC_INJECTION      ) { injection      (0.,z,r,u,w,t,hr,ht); }
+    hy_dens_int      (k) = hr;
+    hy_dens_theta_int(k) = hr*ht;
+    hy_pressure_int  (k) = C0*pow((hr*ht),gamm);
+  }
+
+  fixed_data.hy_dens_cell       = realConst1d(hy_dens_cell      );
+  fixed_data.hy_dens_theta_cell = realConst1d(hy_dens_theta_cell);
+  fixed_data.hy_dens_int        = realConst1d(hy_dens_int       );
+  fixed_data.hy_dens_theta_int  = realConst1d(hy_dens_theta_int );
+  fixed_data.hy_pressure_int    = realConst1d(hy_pressure_int   );
+}
+
+
+//This test case is initially balanced but injects fast, cold air from the left boundary near the model top
+//x and z are input coordinates at which to sample
+//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+//hr and ht are output background hydrostatic density and potential temperature at that location
+void injection( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+  (hr, ht) = hydro_const_theta(z)
+  r = 0.;
+  t = 0.;
+  u = 0.;
+  w = 0.;
+}
+
+
+# Initialize a density current (falling cold thermal that propagates along the model bottom)
+# x and z are input coordinates at which to sample
+# r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+# hr and ht are output background hydrostatic density and potential temperature at that location
+void density_current( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+  (hr, ht) = hydro_const_theta(z)
+  r = 0.;
+  t = 0.;
+  u = 0.;
+  w = 0.;
+  t = t + sample_ellipse_cosine(x,z,-20. ,xlen/2,5000.,4000.,2000.);
+}
+
+
+//x and z are input coordinates at which to sample
+//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+//hr and ht are output background hydrostatic density and potential temperature at that location
+void gravity_waves ( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+  hydro_const_bvfreq(z,0.02,hr,ht);
+  r = 0.;
+  t = 0.;
+  u = 15.;
+  w = 0.;
+}
+
+
+//Rising thermal
+//x and z are input coordinates at which to sample
+//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+//hr and ht are output background hydrostatic density and potential temperature at that location
+void thermal( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+  (hr, ht) = hydro_const_theta(z)
+  r = 0.;
+  t = 0.;
+  u = 0.;
+  w = 0.;
+  t = t + sample_ellipse_cosine(x,z, 3. ,xlen/2,2000.,2000.,2000.);
+}
+
+
+# Colliding thermals
+# x and z are input coordinates at which to sample
+# r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+# hr and ht are output background hydrostatic density and potential temperature at that location
+def collision(x: real, z: real): # returns (r, u, w, t, hr, ht)
+  (hr, ht) = hydro_const_theta(z)
+  r = 0.0
+  t = 0.0
+  u = 0.0
+  w = 0.0
+  t = t + sample_ellipse_cosine(x, z,  20.0 , xlen/2, 2000.0, 2000.0, 2000.0)
+  t = t + sample_ellipse_cosine(x, z, -20.0 , xlen/2, 8000.0, 2000.0, 2000.0)
+
+  return (r, u, w, t, hr, ht)
+
+
+# Establish hydrostatic balance using constant potential temperature (thermally neutral atmosphere)
+# z is the input coordinate
+# Returns (r,t): r and t are the output background hydrostatic density and potential temperature
+def hydro_const_theta(z: real): # returns (r, t)
+  theta0: real = 300.0  # Background potential temperature
+  exner0: real =   1.0  # Surface-level Exner pressure
+  # Establish hydrostatic balance first using Exner pressure
+  t = theta0                                  # Potential Temperature at z
+  exner = exner0 - grav * z / (cp * theta0)   # Exner pressure at z
+  p = p0 * pow(exner, (cp/rd))                # Pressure at z
+  rt = pow((p / C0), (1. / gamm))             # rho*theta at z
+  r = rt / t                                  # Density at z
+
+  return (r, t)
+}
+
+
+//Establish hydrostatic balance using constant Brunt-Vaisala frequency
+//z is the input coordinate
+//bv_freq0 is the constant Brunt-Vaisala frequency
+//r and t are the output background hydrostatic density and potential temperature
+void hydro_const_bvfreq( real z , real bv_freq0 , real &r , real &t ) {
+  const real theta0 = 300.;  //Background potential temperature
+  const real exner0 = 1.;    //Surface-level Exner pressure
+  t = theta0 * exp( bv_freq0*bv_freq0 / grav * z );                                    //Pot temp at z
+  real exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z
+  real p = p0 * pow(exner,(cp/rd));                                                         //Pressure at z
+  real rt = pow((p / C0),(1. / gamm));                                                  //rho*theta at z
+  r = rt / t;                                                                          //Density at z
+}
+
+
+//Sample from an ellipse of a specified center, radius, and amplitude at a specified location
+//x and z are input coordinates
+//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse
+real sample_ellipse_cosine( real x , real z , real amp , real x0 , real z0 , real xrad , real zrad ) {
+  //Compute distance from bubble center
+  real dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.;
+  //If the distance from bubble center is less than the radius, create a cos**2 profile
+  if (dist <= pi / 2.) {
+    return amp * pow(cos(dist),2.);
+  } else {
+    return 0.;
+  }
+}
+
+
+//Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime)
+//The file I/O uses parallel-netcdf, the only external library required for this mini-app.
+//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
+void output( realConst3d state , real etime , int &num_out , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &i_beg              = fixed_data.i_beg             ;
+  auto &k_beg              = fixed_data.k_beg             ;
+  auto &mainproc         = fixed_data.mainproc        ;
+  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
+  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
+
+  int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
+  MPI_Offset st1[1], ct1[1], st3[3], ct3[3];
+  //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta)
+  //Inform the user
+  if (mainproc) { printf("*** OUTPUT ***\n"); }
+  # Allocate some (big) temp arrays
+  dens = doub2d("dens", nz, nx);
+  uwnd = doub2d("uwnd", nz, nx);
+  wwnd = doub2d("wwnd", nz, nx);
+  theta = doub2d("theta", nz, nx);
+
+  //If the elapsed time is zero, create the file. Otherwise, open the file
+  if (etime == 0) {
+    //Create the file
+    ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ );
+    //Create the dimensions
+    ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ );
+    ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob      , &x_dimid ) , __LINE__ );
+    ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob      , &z_dimid ) , __LINE__ );
+    //Create the variables
+    dimids[0] = t_dimid;
+    ncwrap( ncmpi_def_var( ncid , "t"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ );
+    dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid;
+    ncwrap( ncmpi_def_var( ncid , "dens"  , NC_DOUBLE , 3 , dimids ,  &dens_varid ) , __LINE__ );
+    ncwrap( ncmpi_def_var( ncid , "uwnd"  , NC_DOUBLE , 3 , dimids ,  &uwnd_varid ) , __LINE__ );
+    ncwrap( ncmpi_def_var( ncid , "wwnd"  , NC_DOUBLE , 3 , dimids ,  &wwnd_varid ) , __LINE__ );
+    ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ );
+    //End "define" mode
+    ncwrap( ncmpi_enddef( ncid ) , __LINE__ );
+  } else {
+    //Open the file
+    ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ );
+    //Get the variable IDs
+    ncwrap( ncmpi_inq_varid( ncid , "dens"  ,  &dens_varid ) , __LINE__ );
+    ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ );
+    ncwrap( ncmpi_inq_varid( ncid , "wwnd"  ,  &wwnd_varid ) , __LINE__ );
+    ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ );
+    ncwrap( ncmpi_inq_varid( ncid , "t"     ,     &t_varid ) , __LINE__ );
+  }
+
+  //Store perturbed values in the temp arrays for output
+  /////////////////////////////////////////////////
+  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  /////////////////////////////////////////////////
+  for (int k=0; k<nz; k++) {
+    for (int i=0; i<nx; i++) {
+      dens (k,i) = state(ID_DENS,hs+k,hs+i);
+      uwnd (k,i) = state(ID_UMOM,hs+k,hs+i) / ( hy_dens_cell(hs+k) + state(ID_DENS,hs+k,hs+i) );
+      wwnd (k,i) = state(ID_WMOM,hs+k,hs+i) / ( hy_dens_cell(hs+k) + state(ID_DENS,hs+k,hs+i) );
+      theta(k,i) = ( state(ID_RHOT,hs+k,hs+i) + hy_dens_theta_cell(hs+k) ) / ( hy_dens_cell(hs+k) + state(ID_DENS,hs+k,hs+i) ) - hy_dens_theta_cell(hs+k) / hy_dens_cell(hs+k);
+    }
+  }
+
+  //Write the grid data to file with all the processes writing collectively
+  st3[0] = num_out; st3[1] = k_beg; st3[2] = i_beg;
+  ct3[0] = 1      ; ct3[1] = nz   ; ct3[2] = nx   ;
+  ncwrap( ncmpi_put_vara_double_all( ncid ,  dens_varid , st3 , ct3 , dens.data()  ) , __LINE__ );
+  ncwrap( ncmpi_put_vara_double_all( ncid ,  uwnd_varid , st3 , ct3 , uwnd.data()  ) , __LINE__ );
+  ncwrap( ncmpi_put_vara_double_all( ncid ,  wwnd_varid , st3 , ct3 , wwnd.data()  ) , __LINE__ );
+  ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.data() ) , __LINE__ );
+
+  //Only the main process needs to write the elapsed time
+  //Begin "independent" write mode
+  ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ );
+  //write elapsed time to file
+  if (mainproc) {
+    st1[0] = num_out;
+    ct1[0] = 1;
+    double etimearr[1];
+    etimearr[0] = etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ );
+  }
+  //End "independent" write mode
+  ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ );
+
+  //Close the file
+  ncwrap( ncmpi_close(ncid) , __LINE__ );
+
+  //Increment the number of outputs
+  num_out = num_out + 1;
+}
+
+
+//Error reporting routine for the PNetCDF I/O
+void ncwrap( int ierr , int line ) {
+  if (ierr != NC_NOERR) {
+    printf("NetCDF Error at line: %d\n", line);
+    printf("%s\n",ncmpi_strerror(ierr));
+    exit(-1);
+  }
+}
+
+
+void finalize() {
+}
+
+
+//Compute reduced quantities for error checking without resorting to the "ncdiff" tool
+void reductions( realConst3d state , double &mass , double &te , Fixed_data const &fixed_data ) {
+  auto &nx                 = fixed_data.nx                ;
+  auto &nz                 = fixed_data.nz                ;
+  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
+  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
+
+  mass = 0;
+  te   = 0;
+  for (int k=0; k<nz; k++) {
+    for (int i=0; i<nx; i++) {
+      double r  =   state(ID_DENS,hs+k,hs+i) + hy_dens_cell(hs+k);             // Density
+      double u  =   state(ID_UMOM,hs+k,hs+i) / r;                              // U-wind
+      double w  =   state(ID_WMOM,hs+k,hs+i) / r;                              // W-wind
+      double th = ( state(ID_RHOT,hs+k,hs+i) + hy_dens_theta_cell(hs+k) ) / r; // Potential Temperature (theta)
+      double p  = C0*pow(r*th,gamm);                               // Pressure
+      double t  = th / pow(p0/p,rd/cp);                            // Temperature
+      double ke = r*(u*u+w*w);                                     // Kinetic Energy
+      double ie = r*cv*t;                                          // Internal Energy
+      mass += r        *dx*dz; // Accumulate domain mass
+      te   += (ke + ie)*dx*dz; // Accumulate domain total energy
+    }
+  }
+  double glob[2], loc[2];
+  loc[0] = mass;
+  loc[1] = te;
+  int ierr = MPI_Allreduce(loc,glob,2,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+  mass = glob[0];
+  te   = glob[1];
+}
+
+

From 1a1120cc90de4b22ccb5795358f32423a671f2d6 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 3 Mar 2025 17:09:42 -0700
Subject: [PATCH 02/37] Progress on Python port; not done yet

---
 python/miniWeather.py | 481 ++++++++++++++++++++++--------------------
 1 file changed, 252 insertions(+), 229 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 6bb16a3..ab8e3ba 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -67,9 +67,9 @@ def doub2d(name: string, nx: int, nz: int):
   #MPI_Init(&argc,&argv);
   #yakl::init();
   {
-    Fixed_data fixed_data;
-    real3d state;
-    real dt;                    //Model time step (seconds)
+    #Fixed_data fixed_data;
+    #real3d state;
+    #real dt;                    //Model time step (seconds)
 
     # init allocates state
     (fixed_data, state, dt) = init() # init( state , dt , fixed_data );
@@ -122,15 +122,15 @@ def doub2d(name: string, nx: int, nz: int):
     double mass, te;
     reductions(state,mass,te,fixed_data);
 
-    if (mainproc) {
-      printf( "d_mass: %le\n" , (mass - mass0)/mass0 );
-      printf( "d_te:   %le\n" , (te   - te0  )/te0   );
-    }
+    if mainproc:
+      print( "d_mass: %le\n" % ((mass - mass0)/mass0) )
+      print( "d_te:   %le\n" % ((te   - te0  )/te0  ) )
 
-    finalize();
+    finalize()
   }
-  yakl::finalize();
-  MPI_Finalize();
+
+  # yakl::finalize();
+  # MPI_Finalize();
 }
 
 
@@ -213,26 +213,25 @@ def perform_timestep(
     yakl::timer_stop("tendencies z");
   }
 
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  //Apply the tendencies to the fluid state
-  yakl::timer_start("apply tendencies");
-  for (int ll=0; ll<NUM_VARS; ll++) {
-    for (int k=0; k<nz; k++) {
-      for (int i=0; i<nx; i++) {
-        if (data_spec_int == DATA_SPEC_GRAVITY_WAVES) {
-          real x = (i_beg + i+0.5)*dx;
-          real z = (k_beg + k+0.5)*dz;
-          real wpert = sample_ellipse_cosine( x,z , 0.01 , xlen/8,1000., 500.,500. );
-          tend(ID_WMOM,k,i) += wpert*hy_dens_cell(hs+k);
-        }
+  # /////////////////////////////////////////////////
+  # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  # Apply the tendencies to the fluid state
+  # yakl::timer_start("apply tendencies");
+  for ll in range(NUM_VARS):
+    for k in range(nz):
+      for i in range(nx):
+        if data_spec_int == DATA_SPEC_GRAVITY_WAVES:
+          x: real = (i_beg + i+0.5)*dx;
+          z: real = (k_beg + k+0.5)*dz;
+          wpert: real = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0)
+          tend[ID_WMOM,k,i] += wpert*hy_dens_cell[hs+k]
+
         state_out(ll,hs+k,hs+i) = state_init(ll,hs+k,hs+i) + dt * tend(ll,k,i);
-      }
-    }
-  }
-  yakl::timer_stop("apply tendencies");
-}
+
+  # yakl::timer_stop("apply tendencies");
+
+  # FIXME (mfh 2025/03/03) This should return something; not sure what yet
 
 
 //Compute the time tendencies of the fluid state using forcing in the x-direction
@@ -381,44 +380,38 @@ def perform_timestep(
   auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
   auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
 
-  ////////////////////////////////////////////////////////////////////////
-  // TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS
-  // (1) give    state(1:hs,1:nz,1:NUM_VARS)       to   my left  neighbor
-  // (2) receive state(1-hs:0,1:nz,1:NUM_VARS)     from my left  neighbor
-  // (3) give    state(nx-hs+1:nx,1:nz,1:NUM_VARS) to   my right neighbor
-  // (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor
-  ////////////////////////////////////////////////////////////////////////
-
-  //////////////////////////////////////////////////////
-  // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI
-  //////////////////////////////////////////////////////
-  for (int ll=0; ll<NUM_VARS; ll++) {
-    for (int k=0; k<nz; k++) {
-      state(ll,hs+k,0      ) = state(ll,hs+k,nx+hs-2);
-      state(ll,hs+k,1      ) = state(ll,hs+k,nx+hs-1);
-      state(ll,hs+k,nx+hs  ) = state(ll,hs+k,hs     );
-      state(ll,hs+k,nx+hs+1) = state(ll,hs+k,hs+1   );
-    }
-  }
-  ////////////////////////////////////////////////////
-
-  if (data_spec_int == DATA_SPEC_INJECTION) {
-    if (myrank == 0) {
-      /////////////////////////////////////////////////
-      // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
-      /////////////////////////////////////////////////
-      for (int k=0; k<nz; k++) {
-        for (int i=0; i<hs; i++) {
-          real z = (k_beg + k+0.5)*dz;
-          if (abs(z-3*zlen/4) <= zlen/16) {
-            state(ID_UMOM,hs+k,i) = (state(ID_DENS,hs+k,i)+hy_dens_cell(hs+k)) * 50.;
-            state(ID_RHOT,hs+k,i) = (state(ID_DENS,hs+k,i)+hy_dens_cell(hs+k)) * 298. - hy_dens_theta_cell(hs+k);
-          }
-        }
-      }
-    }
-  }
-}
+  # //////////////////////////////////////////////////////////////////////
+  # TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS
+  # (1) give    state(1:hs,1:nz,1:NUM_VARS)       to   my left  neighbor
+  # (2) receive state(1-hs:0,1:nz,1:NUM_VARS)     from my left  neighbor
+  # (3) give    state(nx-hs+1:nx,1:nz,1:NUM_VARS) to   my right neighbor
+  # (4) receive state(nx+1:nx+hs,1:nz,1:NUM_VARS) from my right neighbor
+  # //////////////////////////////////////////////////////////////////////
+
+  # //////////////////////////////////////////////////////
+  # // DELETE THE SERIAL CODE BELOW AND REPLACE WITH MPI
+  # //////////////////////////////////////////////////////
+  for ll in range(NUM_VARS):
+    for k in range(nz):
+      state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2];
+      state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1];
+      state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ];
+      state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ];
+
+  if data_spec_int == DATA_SPEC_INJECTION:
+    if myrank == 0:
+      #/////////////////////////////////////////////////
+      # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+      #/////////////////////////////////////////////////
+      for k in range(nz):
+        for i in range(hs):
+          z: real = (k_beg + k+0.5)*dz
+          if abs(z-3*zlen/4) <= zlen/16:
+            state[ID_UMOM,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 50.0
+            state[ID_RHOT,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 298.0 - hy_dens_theta_cell[hs+k]
+
+  # TODO (mfh 2025/03/03) Return something, not sure what yet
+
 
 
 //Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI
@@ -428,34 +421,33 @@ def perform_timestep(
   auto &nz                 = fixed_data.nz                ;
   auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
   
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  for (int ll=0; ll<NUM_VARS; ll++) {
-    for (int i=0; i<nx+2*hs; i++) {
-      if (ll == ID_WMOM) {
-        state(ll,0      ,i) = 0.;
-        state(ll,1      ,i) = 0.;
-        state(ll,nz+hs  ,i) = 0.;
-        state(ll,nz+hs+1,i) = 0.;
-      } else if (ll == ID_UMOM) {
-        state(ll,0      ,i) = state(ll,hs     ,i) / hy_dens_cell(hs     ) * hy_dens_cell(0      );
-        state(ll,1      ,i) = state(ll,hs     ,i) / hy_dens_cell(hs     ) * hy_dens_cell(1      );
-        state(ll,nz+hs  ,i) = state(ll,nz+hs-1,i) / hy_dens_cell(nz+hs-1) * hy_dens_cell(nz+hs  );
-        state(ll,nz+hs+1,i) = state(ll,nz+hs-1,i) / hy_dens_cell(nz+hs-1) * hy_dens_cell(nz+hs+1);
-      } else {
-        state(ll,0      ,i) = state(ll,hs     ,i);
-        state(ll,1      ,i) = state(ll,hs     ,i);
-        state(ll,nz+hs  ,i) = state(ll,nz+hs-1,i);
-        state(ll,nz+hs+1,i) = state(ll,nz+hs-1,i);
-      }
-    }
-  }
-}
+  # /////////////////////////////////////////////////
+  # // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  for ll in range(NUM_VARS):
+    for i in range(nx+2*hs):
+      if ll == ID_WMOM:
+        state[ll,0      ,i] = 0.0
+        state[ll,1      ,i] = 0.0
+        state[ll,nz+hs  ,i] = 0.0
+        state[ll,nz+hs+1,i] = 0.0
+      elif ll == ID_UMOM:
+        state[ll,0      ,i] = state[ll,hs     ,i] / hy_dens_cell[hs     ] * hy_dens_cell[0      ]
+        state[ll,1      ,i] = state[ll,hs     ,i] / hy_dens_cell[hs     ] * hy_dens_cell[1      ]
+        state[ll,nz+hs  ,i] = state[ll,nz+hs-1,i] / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs  ]
+        state[ll,nz+hs+1,i] = state[ll,nz+hs-1,i] / hy_dens_cell[nz+hs-1] * hy_dens_cell[nz+hs+1]
+      else:
+        state[ll,0      ,i] = state[ll,hs     ,i]
+        state[ll,1      ,i] = state[ll,hs     ,i]
+        state[ll,nz+hs  ,i] = state[ll,nz+hs-1,i]
+        state[ll,nz+hs+1,i] = state[ll,nz+hs-1,i]
+
+  # TODO (mfh 2025/03/03) Return something, not sure what yet
+
 
 # state, dt, and fixed_data used to be output parameters.
 # It would be more Pythonic to return them as a tuple.
-def init(): # tuple[real3d, real, Fixed_data]:
+def init(): # -> tuple[real3d, real, Fixed_data]:
   #real3d state
   #Fixed_data fixed_data
   #real dt
@@ -514,8 +506,16 @@ def init(): # tuple[real3d, real, Fixed_data]:
   # SArray<real,1,nqpoints> qpoints;
   # SArray<real,1,nqpoints> qweights;
 
-  qpoints = np.array([0.112701665379258311482073460022, 0.500000000000000000000000000000, 0.887298334620741688517926539980], dtype=real)
-  qweights = np.array([0.277777777777777777777777777779, 0.444444444444444444444444444444, 0.277777777777777777777777777779], dtype=real)
+  qpoints = np.array([
+      0.112701665379258311482073460022,
+      0.500000000000000000000000000000,
+      0.887298334620741688517926539980
+    ], dtype=real)
+  qweights = np.array([
+      0.277777777777777777777777777779,
+      0.444444444444444444444444444444,
+      0.277777777777777777777777777779
+    ], dtype=real)
 
   # //////////////////////////////////////////////////////////////////////////
   # Initialize the cell-averaged fluid state via Gauss-Legendre quadrature
@@ -565,103 +565,120 @@ def init(): # tuple[real3d, real, Fixed_data]:
   hy_dens_theta_int  = real1d("hy_dens_theta_int ", nz+1);
   hy_pressure_int    = real1d("hy_pressure_int   ", nz+1);
 
-  //Compute the hydrostatic background state over vertical cell averages
-  /////////////////////////////////////////////////
-  // TODO: MAKE THIS LOOP A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  for (int k=0; k<nz+2*hs; k++) {
-    hy_dens_cell      (k) = 0.;
-    hy_dens_theta_cell(k) = 0.;
-    for (int kk=0; kk<nqpoints; kk++) {
-      real z = (k_beg + k-hs+0.5)*dz;
-      real r, u, w, t, hr, ht;
-      //Set the fluid state based on the user's specification
-      if (data_spec_int == DATA_SPEC_COLLISION      ) { collision      (0.,z,r,u,w,t,hr,ht); }
-      if (data_spec_int == DATA_SPEC_THERMAL        ) { thermal        (0.,z,r,u,w,t,hr,ht); }
-      if (data_spec_int == DATA_SPEC_GRAVITY_WAVES  ) { gravity_waves  (0.,z,r,u,w,t,hr,ht); }
-      if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(0.,z,r,u,w,t,hr,ht); }
-      if (data_spec_int == DATA_SPEC_INJECTION      ) { injection      (0.,z,r,u,w,t,hr,ht); }
-      hy_dens_cell      (k) = hy_dens_cell      (k) + hr    * qweights(kk);
-      hy_dens_theta_cell(k) = hy_dens_theta_cell(k) + hr*ht * qweights(kk);
-    }
-  }
-
-  //Compute the hydrostatic background state at vertical cell interfaces
-  /////////////////////////////////////////////////
-  // TODO: MAKE THIS LOOP A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  for (int k=0; k<nz+1; k++) {
-    real z = (k_beg + k)*dz;
-    real r, u, w, t, hr, ht;
-    if (data_spec_int == DATA_SPEC_COLLISION      ) { collision      (0.,z,r,u,w,t,hr,ht); }
-    if (data_spec_int == DATA_SPEC_THERMAL        ) { thermal        (0.,z,r,u,w,t,hr,ht); }
-    if (data_spec_int == DATA_SPEC_GRAVITY_WAVES  ) { gravity_waves  (0.,z,r,u,w,t,hr,ht); }
-    if (data_spec_int == DATA_SPEC_DENSITY_CURRENT) { density_current(0.,z,r,u,w,t,hr,ht); }
-    if (data_spec_int == DATA_SPEC_INJECTION      ) { injection      (0.,z,r,u,w,t,hr,ht); }
-    hy_dens_int      (k) = hr;
-    hy_dens_theta_int(k) = hr*ht;
-    hy_pressure_int  (k) = C0*pow((hr*ht),gamm);
-  }
-
-  fixed_data.hy_dens_cell       = realConst1d(hy_dens_cell      );
-  fixed_data.hy_dens_theta_cell = realConst1d(hy_dens_theta_cell);
-  fixed_data.hy_dens_int        = realConst1d(hy_dens_int       );
-  fixed_data.hy_dens_theta_int  = realConst1d(hy_dens_theta_int );
-  fixed_data.hy_pressure_int    = realConst1d(hy_pressure_int   );
-}
-
-
-//This test case is initially balanced but injects fast, cold air from the left boundary near the model top
-//x and z are input coordinates at which to sample
-//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
-//hr and ht are output background hydrostatic density and potential temperature at that location
-void injection( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+  # Compute the hydrostatic background state over vertical cell averages
+  # /////////////////////////////////////////////////
+  # // TODO: MAKE THIS LOOP A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  for k in range(nz+2*hs):
+    hy_dens_cell[k]       = 0.0
+    hy_dens_theta_cell[k] = 0.0
+    for kk in range(nqpoints):
+      z = (k_beg + k-hs+0.5)*dz
+      # real r, u, w, t, hr, ht; # formerly output arguments
+      # Set the fluid state based on the user's specification
+      if data_spec_int == DATA_SPEC_COLLISION:
+        (r, u, w, t, hr, ht) = collision(0.0, z)
+      if data_spec_int == DATA_SPEC_THERMAL:
+        (r, u, w, t, hr, ht) = thermal(0.0, z)
+      if data_spec_int == DATA_SPEC_GRAVITY_WAVES:
+        (r, u, w, t, hr, ht) = gravity_waves(0.0, z)
+      if data_spec_int == DATA_SPEC_DENSITY_CURRENT:
+        (r, u, w, t, hr, ht) = density_current(0.0, z)
+      if data_spec_int == DATA_SPEC_INJECTION:
+        (r, u, w, t, hr, ht) = injection(0.0, z)
+      
+      hy_dens_cell[k]       = hy_dens_cell[k]       + hr    * qweights[kk];
+      hy_dens_theta_cell[k] = hy_dens_theta_cell[k] + hr*ht * qweights[kk];
+
+  # Compute the hydrostatic background state at vertical cell interfaces
+  # /////////////////////////////////////////////////
+  # // TODO: MAKE THIS LOOP A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  for k in range(nz+1):
+    z = (k_beg + k)*dz
+    # real r, u, w, t, hr, ht; # formerly output arguments
+    if data_spec_int == DATA_SPEC_COLLISION:
+      (r, u, w, t, hr, ht) = collision(0.0, z)
+    if data_spec_int == DATA_SPEC_THERMAL:
+      (r, u, w, t, hr, ht) = thermal(0.0, z)
+    if data_spec_int == DATA_SPEC_GRAVITY_WAVES:
+      (r, u, w, t, hr, ht) = gravity_waves(0.0, z)
+    if data_spec_int == DATA_SPEC_DENSITY_CURRENT:
+      (r, u, w, t, hr, ht) = density_current(0.0, z)
+    if data_spec_int == DATA_SPEC_INJECTION:
+      (r, u, w, t, hr, ht) = injection(0.0, z)
+    
+    hy_dens_int[k]       = hr
+    hy_dens_theta_int[k] = hr*ht
+    hy_pressure_int[k]   = C0*pow((hr*ht),gamm)
+
+  fixed_data.hy_dens_cell       = realConst1d(hy_dens_cell      )
+  fixed_data.hy_dens_theta_cell = realConst1d(hy_dens_theta_cell)
+  fixed_data.hy_dens_int        = realConst1d(hy_dens_int       )
+  fixed_data.hy_dens_theta_int  = realConst1d(hy_dens_theta_int )
+  fixed_data.hy_pressure_int    = realConst1d(hy_pressure_int   )
+
+  # FIXME (mfh 2025/03/03) This should return something; not sure what yet
+
+# This test case is initially balanced but injects fast, cold air from the left boundary near the model top
+# x and z are input coordinates at which to sample
+# r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+# hr and ht are output background hydrostatic density and potential temperature at that location
+def injection(x: real, z: real): # returns (r, u, w, t, hr, ht)
   (hr, ht) = hydro_const_theta(z)
-  r = 0.;
-  t = 0.;
-  u = 0.;
-  w = 0.;
-}
+  r = 0.0
+  t = 0.0
+  u = 0.0
+  w = 0.0
+
+  return (r, u, w, t, hr, ht)
 
 
 # Initialize a density current (falling cold thermal that propagates along the model bottom)
 # x and z are input coordinates at which to sample
 # r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
 # hr and ht are output background hydrostatic density and potential temperature at that location
-void density_current( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+def density_current(x: real, z: real): # returns (r, u, w, t, hr, ht)
   (hr, ht) = hydro_const_theta(z)
-  r = 0.;
-  t = 0.;
-  u = 0.;
-  w = 0.;
-  t = t + sample_ellipse_cosine(x,z,-20. ,xlen/2,5000.,4000.,2000.);
-}
+  r = 0.0
+  t = 0.0
+  u = 0.0
+  w = 0.0
+  # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter.
+  # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++).
+  t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0)
 
+  return (r, u, w, t, hr, ht)
 
-//x and z are input coordinates at which to sample
-//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
-//hr and ht are output background hydrostatic density and potential temperature at that location
-void gravity_waves ( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
-  hydro_const_bvfreq(z,0.02,hr,ht);
-  r = 0.;
-  t = 0.;
-  u = 15.;
-  w = 0.;
-}
 
+# x and z are input coordinates at which to sample
+# r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+# hr and ht are output background hydrostatic density and potential temperature at that location
+def gravity_waves(x: real, z: real): # returns (r, u, w, t, hr, ht)
+  (hr, ht) = hydro_const_bvfreq(z, 0.02)
+  r = 0.0
+  t = 0.0
+  u = 15.0
+  w = 0.0
 
-//Rising thermal
-//x and z are input coordinates at which to sample
-//r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
-//hr and ht are output background hydrostatic density and potential temperature at that location
-void thermal( real x , real z , real &r , real &u , real &w , real &t , real &hr , real &ht ) {
+  return (r, u, w, t, hr, ht)
+
+
+# Rising thermal
+# x and z are input coordinates at which to sample
+# r,u,w,t are output density, u-wind, w-wind, and potential temperature at that location
+# hr and ht are output background hydrostatic density and potential temperature at that location
+def thermal(x: real, z: real): # returns (r, u, w, t, hr, ht)
   (hr, ht) = hydro_const_theta(z)
-  r = 0.;
-  t = 0.;
-  u = 0.;
-  w = 0.;
-  t = t + sample_ellipse_cosine(x,z, 3. ,xlen/2,2000.,2000.,2000.);
-}
+  r = 0.0
+  t = 0.0
+  u = 0.0
+  w = 0.0
+  # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter.
+  # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++).
+  t = sample_ellipse_cosine(x, z, 3.0, xlen/2, 2000.0, 2000.0, 2000.0)
+
+  return (r, u, w, t, hr, ht)
 
 
 # Colliding thermals
@@ -674,8 +691,10 @@ def collision(x: real, z: real): # returns (r, u, w, t, hr, ht)
   t = 0.0
   u = 0.0
   w = 0.0
-  t = t + sample_ellipse_cosine(x, z,  20.0 , xlen/2, 2000.0, 2000.0, 2000.0)
-  t = t + sample_ellipse_cosine(x, z, -20.0 , xlen/2, 8000.0, 2000.0, 2000.0)
+  # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter.
+  # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++).
+  t = sample_ellipse_cosine(x, z,  20.0, xlen/2, 2000.0, 2000.0, 2000.0) + \
+      sample_ellipse_cosine(x, z, -20.0, xlen/2, 8000.0, 2000.0, 2000.0)
 
   return (r, u, w, t, hr, ht)
 
@@ -697,19 +716,20 @@ def hydro_const_theta(z: real): # returns (r, t)
 }
 
 
-//Establish hydrostatic balance using constant Brunt-Vaisala frequency
-//z is the input coordinate
-//bv_freq0 is the constant Brunt-Vaisala frequency
-//r and t are the output background hydrostatic density and potential temperature
-void hydro_const_bvfreq( real z , real bv_freq0 , real &r , real &t ) {
-  const real theta0 = 300.;  //Background potential temperature
-  const real exner0 = 1.;    //Surface-level Exner pressure
-  t = theta0 * exp( bv_freq0*bv_freq0 / grav * z );                                    //Pot temp at z
-  real exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0); //Exner pressure at z
-  real p = p0 * pow(exner,(cp/rd));                                                         //Pressure at z
-  real rt = pow((p / C0),(1. / gamm));                                                  //rho*theta at z
-  r = rt / t;                                                                          //Density at z
-}
+# Establish hydrostatic balance using constant Brunt-Vaisala frequency
+# z is the input coordinate
+# bv_freq0 is the constant Brunt-Vaisala frequency (also an input)
+# r and t are the output background hydrostatic density and potential temperature
+def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t)
+  theta0: real = 300.0  # Background potential temperature
+  exner0: real =   1.0  # Surface-level Exner pressure
+  t = theta0 * exp( bv_freq0*bv_freq0 / grav * z )                                     # Pot temp at z
+  exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0)  # Exner pressure at z
+  p = p0 * pow(exner,(cp/rd))                                                          # Pressure at z
+  rt = pow((p / C0),(1. / gamm))                                                       # rho*theta at z
+  r = rt / t                                                                           # Density at z
+
+  return (r, t)
 
 
 //Sample from an ellipse of a specified center, radius, and amplitude at a specified location
@@ -831,39 +851,42 @@ def hydro_const_theta(z: real): # returns (r, t)
 }
 
 
-void finalize() {
-}
-
-
-//Compute reduced quantities for error checking without resorting to the "ncdiff" tool
-void reductions( realConst3d state , double &mass , double &te , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
-  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
-
-  mass = 0;
-  te   = 0;
-  for (int k=0; k<nz; k++) {
-    for (int i=0; i<nx; i++) {
-      double r  =   state(ID_DENS,hs+k,hs+i) + hy_dens_cell(hs+k);             // Density
-      double u  =   state(ID_UMOM,hs+k,hs+i) / r;                              // U-wind
-      double w  =   state(ID_WMOM,hs+k,hs+i) / r;                              // W-wind
-      double th = ( state(ID_RHOT,hs+k,hs+i) + hy_dens_theta_cell(hs+k) ) / r; // Potential Temperature (theta)
-      double p  = C0*pow(r*th,gamm);                               // Pressure
-      double t  = th / pow(p0/p,rd/cp);                            // Temperature
-      double ke = r*(u*u+w*w);                                     // Kinetic Energy
-      double ie = r*cv*t;                                          // Internal Energy
-      mass += r        *dx*dz; // Accumulate domain mass
-      te   += (ke + ie)*dx*dz; // Accumulate domain total energy
-    }
-  }
-  double glob[2], loc[2];
-  loc[0] = mass;
-  loc[1] = te;
-  int ierr = MPI_Allreduce(loc,glob,2,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
-  mass = glob[0];
-  te   = glob[1];
-}
-
-
+def finalize() -> None:
+  pass
+
+
+# Compute reduced quantities for error checking without resorting to the "ncdiff" tool
+#void reductions( realConst3d state , double &mass , double &te , Fixed_data const &fixed_data ) {
+def reductions(
+    state # realConst3d, an input parameter
+    fixed_data # Fixed_data const&, an input parameter
+  ) -> tuple[double, double]: # mass, te
+
+  nx                 = fixed_data.nx                
+  nz                 = fixed_data.nz                
+  hy_dens_cell       = fixed_data.hy_dens_cell      
+  hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
+
+  mass = 0
+  te   = 0
+  for k in range(nz):
+    for i in range(nx):
+      r = state[ID_DENS,hs+k,hs+i] + hy_dens_cell[hs+k]                # Density
+      u = state[ID_UMOM,hs+k,hs+i] / r                                 # U-wind
+      w = state[ID_WMOM,hs+k,hs+i] / r                                 # W-wind
+      th = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / r # Potential Temperature (theta)
+      p  = C0*pow(r*th,gamm)                               # Pressure
+      t  = th / pow(p0/p,rd/cp)                            # Temperature
+      ke = r*(u*u+w*w)                                     # Kinetic Energy
+      ie = r*cv*t                                          # Internal Energy
+      mass += r        *dx*dz # Accumulate domain mass
+      te   += (ke + ie)*dx*dz # Accumulate domain total energy
+
+  #double glob[2], loc[2];
+  #loc[0] = mass;
+  #loc[1] = te;
+  #int ierr = MPI_Allreduce(loc,glob,2,MPI_DOUBLE,MPI_SUM,MPI_COMM_WORLD);
+  #mass = glob[0];
+  #te   = glob[1];
+
+  return (mass, te)

From 609c239aad0245752b26185616390d8f740bbde1 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 3 Mar 2025 22:46:58 -0700
Subject: [PATCH 03/37] Progress on porting to Python

---
 python/miniWeather.py | 702 ++++++++++++++++++++++--------------------
 1 file changed, 368 insertions(+), 334 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index ab8e3ba..d601022 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -60,78 +60,73 @@ def doub2d(name: string, nx: int, nz: int):
   realConst1d hy_pressure_int;     //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
 };
 
-///////////////////////////////////////////////////////////////////////////////////////
-// THE MAIN PROGRAM STARTS HERE
-///////////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+# ///////////////////////////////////////////////////////////////////////////////////////
+# // THE MAIN PROGRAM STARTS HERE
+# ///////////////////////////////////////////////////////////////////////////////////////
+def main() -> None:
   #MPI_Init(&argc,&argv);
   #yakl::init();
-  {
-    #Fixed_data fixed_data;
-    #real3d state;
-    #real dt;                    //Model time step (seconds)
 
-    # init allocates state
-    (fixed_data, state, dt) = init() # init( state , dt , fixed_data );
+  #Fixed_data fixed_data;
+  #real3d state;
+  #real dt;                    //Model time step (seconds)
 
-    auto &mainproc = fixed_data.mainproc;
+  # init allocates state
+  (fixed_data, state, dt) = init() # init( state , dt , fixed_data );
 
-    //Initial reductions for mass, kinetic energy, and total energy
-    double mass0, te0;
-    reductions(state,mass0,te0,fixed_data);
+  mainproc = fixed_data.mainproc;
 
-    int  num_out = 0;          //The number of outputs performed so far
-    real output_counter = 0;   //Helps determine when it's time to do output
-    real etime = 0;
+  # Initial reductions for mass, kinetic energy, and total energy
+  (mass0, te0) = reductions(state, fixed_data)
 
-    //Output the initial state
-    if (output_freq >= 0) {
-      output(state,etime,num_out,fixed_data);
-    }
+  num_out: int         = 0  # The number of outputs performed so far
+  output_counter: real = 0  # Helps determine when it's time to do output
+  etime: real          = 0
 
-    int direction_switch = 1;  // Tells dimensionally split which order to take x,z solves
-
-    ////////////////////////////////////////////////////
-    // MAIN TIME STEP LOOP
-    ////////////////////////////////////////////////////
-    auto t1 = std::chrono::steady_clock::now();
-    while (etime < sim_time) {
-      //If the time step leads to exceeding the simulation time, shorten it for the last step
-      if (etime + dt > sim_time) { dt = sim_time - etime; }
-      # Perform a single time step
-      direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
-      //Inform the user
-      #ifndef NO_INFORM
-        if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); }
-      #endif
-      //Update the elapsed time and output counter
-      etime = etime + dt;
-      output_counter = output_counter + dt;
-      //If it's time for output, reset the counter, and do output
-      if (output_freq >= 0 && output_counter >= output_freq) {
-        output_counter = output_counter - output_freq;
-        output(state,etime,num_out,fixed_data);
-      }
-    }
-    auto t2 = std::chrono::steady_clock::now();
-    if (mainproc) {
-      std::cout << "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n";
-    }
+  # Output the initial state
+  if output_freq >= 0:
+    num_out = output(state, etime, num_out, fixed_data)
 
-    //Final reductions for mass, kinetic energy, and total energy
-    double mass, te;
-    reductions(state,mass,te,fixed_data);
+  direction_switch: int = 1 # Tells dimensionally split which order to take x,z solves
 
+  # ////////////////////////////////////////////////////
+  # MAIN TIME STEP LOOP
+  # ////////////////////////////////////////////////////
+  t1 = std::chrono::steady_clock::now()
+  
+  while etime < sim_time:
+    # If the time step leads to exceeding the simulation time, shorten it for the last step
+    if etime + dt > sim_time:
+      dt = sim_time - etime
+
+    # Perform a single time step
+    direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
+    # Inform the user
     if mainproc:
-      print( "d_mass: %le\n" % ((mass - mass0)/mass0) )
-      print( "d_te:   %le\n" % ((te   - te0  )/te0  ) )
+      print( "Elapsed Time: %lf / %lf\n", etime , sim_time )
+    # Update the elapsed time and output counter
+    etime = etime + dt
+    output_counter = output_counter + dt
+    # If it's time for output, reset the counter, and do output
+    if output_freq >= 0 and output_counter >= output_freq:
+      output_counter = output_counter - output_freq
+      num_out = output(state, etime, num_out, fixed_data)
+
+  auto t2 = std::chrono::steady_clock::now();
+  if mainproc:
+    print( "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n" )
 
-    finalize()
-  }
+  # Final reductions for mass, kinetic energy, and total energy
+  (mass, te) = reductions(state, fixed_data)
+
+  if mainproc:
+    print( "d_mass: %le\n" % ((mass - mass0)/mass0) )
+    print( "d_te:   %le\n" % ((te   - te0  )/te0  ) )
+
+  finalize()
 
   # yakl::finalize();
   # MPI_Finalize();
-}
 
 
 # Performs a single dimensionally split time step using a simple low-storage three-stage Runge-Kutta time integrator
@@ -151,26 +146,26 @@ def perform_timestep(
   nx = fixed_data.nx
   nz = fixed_data.nz
 
-  state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs);
+  state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs)
 
   if direction_switch != 0:
     # x-direction first
-    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , fixed_data )
-    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data )
-    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , fixed_data )
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, fixed_data)
     # z-direction second
-    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , fixed_data )
-    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data )
-    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , fixed_data )
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, fixed_data)
   else:
     # z-direction second
-    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_Z , fixed_data )
-    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_Z , fixed_data )
-    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_Z , fixed_data )
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, fixed_data)
     # x-direction first
-    semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , fixed_data )
-    semi_discrete_step( state , state_tmp , state_tmp , dt / 2 , DIR_X , fixed_data )
-    semi_discrete_step( state , state_tmp , state     , dt / 1 , DIR_X , fixed_data )
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, fixed_data)
 
   if direction_switch:
     direction_switch = 0
@@ -181,37 +176,44 @@ def perform_timestep(
 
 
 
-//Perform a single semi-discretized step in time with the form:
-//state_out = state_init + dt * rhs(state_forcing)
-//Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
-void semi_discrete_step( realConst3d state_init , real3d const &state_forcing , real3d const &state_out , real dt , int dir , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &i_beg              = fixed_data.i_beg             ;
-  auto &k_beg              = fixed_data.k_beg             ;
-  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
-
-  tend = real3d("tend", NUM_VARS, nz, nx);
-
-  if        (dir == DIR_X) {
-    //Set the halo values for this MPI task's fluid state in the x-direction
-    yakl::timer_start("halo x");
-    set_halo_values_x(state_forcing,fixed_data);
-    yakl::timer_stop("halo x");
-    //Compute the time tendencies for the fluid state in the x-direction
-    yakl::timer_start("tendencies x");
-    compute_tendencies_x(state_forcing,tend,dt,fixed_data);
-    yakl::timer_stop("tendencies x");
-  } else if (dir == DIR_Z) {
-    //Set the halo values for this MPI task's fluid state in the z-direction
-    yakl::timer_start("halo z");
-    set_halo_values_z(state_forcing,fixed_data);
-    yakl::timer_stop("halo z");
-    //Compute the time tendencies for the fluid state in the z-direction
-    yakl::timer_start("tendencies z");
-    compute_tendencies_z(state_forcing,tend,dt,fixed_data);
-    yakl::timer_stop("tendencies z");
-  }
+# Perform a single semi-discretized step in time with the form:
+# state_out = state_init + dt * rhs(state_forcing)
+# Meaning the step starts from state_init, computes the rhs using state_forcing, and stores the result in state_out
+def semi_discrete_step(
+    state_init, # realConst3d
+    state_forcing, # real3d const&
+    state_out, # real3d const&,
+    dt: real,
+    dir: int,
+    fixed_data # Fixed_data const&
+  ) -> None:
+
+  nx                 = fixed_data.nx
+  nz                 = fixed_data.nz
+  i_beg              = fixed_data.i_beg
+  k_beg              = fixed_data.k_beg
+  hy_dens_cell       = fixed_data.hy_dens_cell
+
+  tend = real3d("tend", NUM_VARS, nz, nx)
+
+  if dir == DIR_X:
+    # Set the halo values for this MPI task's fluid state in the x-direction
+    #yakl::timer_start("halo x");
+    set_halo_values_x(state_forcing, fixed_data)
+    #yakl::timer_stop("halo x");
+    # Compute the time tendencies for the fluid state in the x-direction
+    #yakl::timer_start("tendencies x");
+    compute_tendencies_x(state_forcing, tend, dt, fixed_data)
+    #yakl::timer_stop("tendencies x");
+  elif dir == DIR_Z:
+    # Set the halo values for this MPI task's fluid state in the z-direction
+    #yakl::timer_start("halo z");
+    set_halo_values_z(state_forcing, fixed_data)
+    #yakl::timer_stop("halo z");
+    # Compute the time tendencies for the fluid state in the z-direction
+    #yakl::timer_start("tendencies z");
+    compute_tendencies_z(state_forcing, tend, dt, fixed_data)
+    #yakl::timer_stop("tendencies z");
 
   # /////////////////////////////////////////////////
   # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
@@ -231,154 +233,174 @@ def perform_timestep(
 
   # yakl::timer_stop("apply tendencies");
 
-  # FIXME (mfh 2025/03/03) This should return something; not sure what yet
+  # NOTE It's OK for this not to return anything,
+  # as long as we can treat state_out as an output parameter.
 
 
-//Compute the time tendencies of the fluid state using forcing in the x-direction
-//Since the halos are set in a separate routine, this will not require MPI
-//First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
-//Then, compute the tendencies using those fluxes
-void compute_tendencies_x( realConst3d state , real3d const &tend , real dt , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
-  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
-
-  flux = real3d("flux", NUM_VARS, nz, nx+1);
-
-  //Compute the hyperviscosity coefficient
-  real hv_coef = -hv_beta * dx / (16*dt);
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  //Compute fluxes in the x-direction for each cell
-  for (int k=0; k<nz; k++) {
-    for (int i=0; i<nx+1; i++) {
-      SArray<real,1,4> stencil;
-      SArray<real,1,NUM_VARS> d3_vals;
-      SArray<real,1,NUM_VARS> vals;
-      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
-      for (int ll=0; ll<NUM_VARS; ll++) {
-        for (int s=0; s < sten_size; s++) {
-          stencil(s) = state(ll,hs+k,i+s);
-        }
-        //Fourth-order-accurate interpolation of the state
-        vals(ll) = -stencil(0)/12 + 7*stencil(1)/12 + 7*stencil(2)/12 - stencil(3)/12;
-        //First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
-        d3_vals(ll) = -stencil(0) + 3*stencil(1) - 3*stencil(2) + stencil(3);
-      }
-
-      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
-      real r = vals(ID_DENS) + hy_dens_cell(hs+k);
-      real u = vals(ID_UMOM) / r;
-      real w = vals(ID_WMOM) / r;
-      real t = ( vals(ID_RHOT) + hy_dens_theta_cell(hs+k) ) / r;
-      real p = C0*pow((r*t),gamm);
-
-      //Compute the flux vector
-      flux(ID_DENS,k,i) = r*u     - hv_coef*d3_vals(ID_DENS);
-      flux(ID_UMOM,k,i) = r*u*u+p - hv_coef*d3_vals(ID_UMOM);
-      flux(ID_WMOM,k,i) = r*u*w   - hv_coef*d3_vals(ID_WMOM);
-      flux(ID_RHOT,k,i) = r*u*t   - hv_coef*d3_vals(ID_RHOT);
-    }
-  }
+# Compute the time tendencies of the fluid state using forcing in the x-direction
+# Since the halos are set in a separate routine, this will not require MPI
+# First, compute the flux vector at each cell interface in the x-direction (including hyperviscosity)
+# Then, compute the tendencies using those fluxes
+def compute_tendencies_x(
+    state, # realConst3d
+    tend, # real3d const&
+    dt, # real
+    fixed_data # Fixed_data const&
+  ) -> None:
 
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  //Use the fluxes to compute tendencies for each cell
-  for (int ll=0; ll<NUM_VARS; ll++) {
-    for (int k=0; k<nz; k++) {
-      for (int i=0; i<nx; i++) {
-        tend(ll,k,i) = -( flux(ll,k,i+1) - flux(ll,k,i) ) / dx;
-      }
-    }
-  }
-}
+  nx                 = fixed_data.nx
+  nz                 = fixed_data.nz
+  hy_dens_cell       = fixed_data.hy_dens_cell
+  hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
 
+  flux = real3d("flux", NUM_VARS, nz, nx+1)
 
-//Compute the time tendencies of the fluid state using forcing in the z-direction
-//Since the halos are set in a separate routine, this will not require MPI
-//First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
-//Then, compute the tendencies using those fluxes
-void compute_tendencies_z( realConst3d state , real3d const &tend , real dt , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &hy_dens_int        = fixed_data.hy_dens_int       ;
-  auto &hy_dens_theta_int  = fixed_data.hy_dens_theta_int ;
-  auto &hy_pressure_int    = fixed_data.hy_pressure_int   ;
-
-  flux = real3d("flux", NUM_VARS, nz+1, nx);
-
-  //Compute the hyperviscosity coefficient
-  real hv_coef = -hv_beta * dz / (16*dt);
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  //Compute fluxes in the x-direction for each cell
-  for (int k=0; k<nz+1; k++) {
-    for (int i=0; i<nx; i++) {
-      SArray<real,1,4> stencil;
-      SArray<real,1,NUM_VARS> d3_vals;
-      SArray<real,1,NUM_VARS> vals;
-      //Use fourth-order interpolation from four cell averages to compute the value at the interface in question
-      for (int ll=0; ll<NUM_VARS; ll++) {
-        for (int s=0; s<sten_size; s++) {
-          stencil(s) = state(ll,k+s,hs+i);
-        }
-        //Fourth-order-accurate interpolation of the state
-        vals(ll) = -stencil(0)/12 + 7*stencil(1)/12 + 7*stencil(2)/12 - stencil(3)/12;
-        //First-order-accurate interpolation of the third spatial derivative of the state
-        d3_vals(ll) = -stencil(0) + 3*stencil(1) - 3*stencil(2) + stencil(3);
-      }
-
-      //Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
-      real r = vals(ID_DENS) + hy_dens_int(k);
-      real u = vals(ID_UMOM) / r;
-      real w = vals(ID_WMOM) / r;
-      real t = ( vals(ID_RHOT) + hy_dens_theta_int(k) ) / r;
-      real p = C0*pow((r*t),gamm) - hy_pressure_int(k);
-      if (k == 0 || k == nz) {
+  # Compute the hyperviscosity coefficient
+  hv_coef: real = -hv_beta * dx / (16*dt)
+  # /////////////////////////////////////////////////
+  # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  # Compute fluxes in the x-direction for each cell
+  for k in range(nz):
+    for i in range(nx+1):
+
+      # "Stack Array" -- local multidimensional array type with compile-time extents
+      #SArray<real,1,4> stencil;
+      #SArray<real,1,NUM_VARS> d3_vals;
+      #SArray<real,1,NUM_VARS> vals;
+
+      stencil = np.zeros((1, 4), dtype=real)
+      d3_vals = np.zeros((1, NUM_VARS), dtype=real)
+      vals = np.zeros((1, NUM_VARS), dtype=real)
+      
+      # Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for ll in range(NUM_VARS):
+        for s in range(sten_size):
+          stencil[s] = state[ll,hs+k,i+s]
+
+        # Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12
+        # First-order-accurate interpolation of the third spatial derivative of the state (for artificial viscosity)
+        d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]
+
+      # Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      r = vals[ID_DENS] + hy_dens_cell[hs+k]
+      u = vals[ID_UMOM] / r
+      w = vals[ID_WMOM] / r
+      t = (vals[ID_RHOT] + hy_dens_theta_cell[hs+k]) / r
+      p = C0*pow((r*t), gamm)
+
+      # Compute the flux vector
+      flux[ID_DENS,k,i] = r*u     - hv_coef*d3_vals[ID_DENS]
+      flux[ID_UMOM,k,i] = r*u*u+p - hv_coef*d3_vals[ID_UMOM]
+      flux[ID_WMOM,k,i] = r*u*w   - hv_coef*d3_vals[ID_WMOM]
+      flux[ID_RHOT,k,i] = r*u*t   - hv_coef*d3_vals[ID_RHOT]
+
+  # /////////////////////////////////////////////////
+  # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  # Use the fluxes to compute tendencies for each cell
+  for ll in range(NUM_VARS):
+    for k in range(nz):
+      for i in range(nx):
+        tend[ll,k,i] = -( flux[ll,k,i+1] - flux[ll,k,i] ) / dx
+
+  # NOTE It's OK for this not to return anything,
+  # as long as we can treat tend as an output parameter.
+
+
+# Compute the time tendencies of the fluid state using forcing in the z-direction
+# Since the halos are set in a separate routine, this will not require MPI
+# First, compute the flux vector at each cell interface in the z-direction (including hyperviscosity)
+# Then, compute the tendencies using those fluxes
+def compute_tendencies_z(
+    state, # realConst3d
+    tend, # real3d const&
+    dt, # real
+    fixed_data # Fixed_data const&
+  ) -> None:
+
+  nx                 = fixed_data.nx
+  nz                 = fixed_data.nz
+  hy_dens_int        = fixed_data.hy_dens_int
+  hy_dens_theta_int  = fixed_data.hy_dens_theta_int
+  hy_pressure_int    = fixed_data.hy_pressure_int
+
+  flux = real3d("flux", NUM_VARS, nz+1, nx)
+
+  # Compute the hyperviscosity coefficient
+  hv_coef: real = -hv_beta * dz / (16*dt);
+  # /////////////////////////////////////////////////
+  # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  # Compute fluxes in the x-direction for each cell
+  for k in range(nz+1):
+    for i in range(nx):
+      # "Stack Array" -- local multidimensional array type with compile-time extents
+      #SArray<real,1,4> stencil;
+      #SArray<real,1,NUM_VARS> d3_vals;
+      #SArray<real,1,NUM_VARS> vals;
+
+      stencil = np.zeros((1, 4), dtype=real)
+      d3_vals = np.zeros((1, NUM_VARS), dtype=real)
+      vals = np.zeros((1, NUM_VARS), dtype=real)
+
+      # Use fourth-order interpolation from four cell averages to compute the value at the interface in question
+      for ll in range(NUM_VARS):
+        for s in range(sten_size):
+          stencil[s] = state[ll,k+s,hs+i];
+
+        # Fourth-order-accurate interpolation of the state
+        vals[ll] = -stencil[0]/12 + 7*stencil[1]/12 + 7*stencil[2]/12 - stencil[3]/12
+        # First-order-accurate interpolation of the third spatial derivative of the state
+        d3_vals[ll] = -stencil[0] + 3*stencil[1] - 3*stencil[2] + stencil[3]
+
+      # Compute density, u-wind, w-wind, potential temperature, and pressure (r,u,w,t,p respectively)
+      r: real = vals[ID_DENS] + hy_dens_int[k];
+      u: real = vals[ID_UMOM] / r;
+      w: real = vals[ID_WMOM] / r;
+      t: real = ( vals[ID_RHOT] + hy_dens_theta_int[k] ) / r;
+      p: real = C0*pow((r*t),gamm) - hy_pressure_int[k];
+      if k == 0 or k == nz:
         w                = 0;
-        d3_vals(ID_DENS) = 0;
-      }
-
-      //Compute the flux vector with hyperviscosity
-      flux(ID_DENS,k,i) = r*w     - hv_coef*d3_vals(ID_DENS);
-      flux(ID_UMOM,k,i) = r*w*u   - hv_coef*d3_vals(ID_UMOM);
-      flux(ID_WMOM,k,i) = r*w*w+p - hv_coef*d3_vals(ID_WMOM);
-      flux(ID_RHOT,k,i) = r*w*t   - hv_coef*d3_vals(ID_RHOT);
-    }
-  }
+        d3_vals[ID_DENS] = 0;
 
-  //Use the fluxes to compute tendencies for each cell
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  for (int ll=0; ll<NUM_VARS; ll++) {
-    for (int k=0; k<nz; k++) {
-      for (int i=0; i<nx; i++) {
-        tend(ll,k,i) = -( flux(ll,k+1,i) - flux(ll,k,i) ) / dz;
-        if (ll == ID_WMOM) {
-          tend(ll,k,i) -= state(ID_DENS,hs+k,hs+i)*grav;
-        }
-      }
+      # Compute the flux vector with hyperviscosity
+      flux[ID_DENS,k,i] = r*w     - hv_coef*d3_vals[ID_DENS];
+      flux[ID_UMOM,k,i] = r*w*u   - hv_coef*d3_vals[ID_UMOM];
+      flux[ID_WMOM,k,i] = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
+      flux[ID_RHOT,k,i] = r*w*t   - hv_coef*d3_vals[ID_RHOT];
     }
   }
-}
-
-
 
-//Set this MPI task's halo values in the x-direction. This routine will require MPI
-void set_halo_values_x( real3d const &state , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &k_beg              = fixed_data.k_beg             ;
-  auto &left_rank          = fixed_data.left_rank         ;
-  auto &right_rank         = fixed_data.right_rank        ;
-  auto &myrank             = fixed_data.myrank            ;
-  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
-  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
+  # Use the fluxes to compute tendencies for each cell
+  #/////////////////////////////////////////////////
+  # TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
+  #/////////////////////////////////////////////////
+  for ll in range(NUM_VARS):
+    for k in range(nz):
+      for i in range(nx):
+        tend[ll,k,i] = -( flux[ll,k+1,i] - flux[ll,k,i] ) / dz
+        if ll == ID_WMOM:
+          tend[ll,k,i] -= state[ID_DENS,hs+k,hs+i]*grav
+
+  # NOTE (mfh 2025/03/03) Don't need to return anything, as long as tend can be an output parameter
+
+
+# Set this MPI task's halo values in the x-direction. This routine will require MPI
+def set_halo_values_x(
+    state, # real3d const&
+    fixed_data # Fixed_data const&
+  ) -> None:
+
+  nx                 = fixed_data.nx
+  nz                 = fixed_data.nz
+  k_beg              = fixed_data.k_beg
+  left_rank          = fixed_data.left_rank
+  right_rank         = fixed_data.right_rank
+  myrank             = fixed_data.myrank
+  hy_dens_cell       = fixed_data.hy_dens_cell
+  hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
 
   # //////////////////////////////////////////////////////////////////////
   # TODO: EXCHANGE HALO VALUES WITH NEIGHBORING MPI TASKS
@@ -410,17 +432,20 @@ def perform_timestep(
             state[ID_UMOM,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 50.0
             state[ID_RHOT,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 298.0 - hy_dens_theta_cell[hs+k]
 
-  # TODO (mfh 2025/03/03) Return something, not sure what yet
+  # NOTE (mfh 2025/03/03) Don't need to return anything, as long as state can be an output parameter
 
 
+# Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI
+# decomposition in the vertical direction
+def set_halo_values_z(
+    state, # real3d const&
+    fixed_data # Fixed_data const&
+  ) -> None:
 
-//Set this MPI task's halo values in the z-direction. This does not require MPI because there is no MPI
-//decomposition in the vertical direction
-void set_halo_values_z( real3d const &state , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
-  
+  nx                 = fixed_data.nx
+  nz                 = fixed_data.nz
+  hy_dens_cell       = fixed_data.hy_dens_cell
+ 
   # /////////////////////////////////////////////////
   # // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
   # /////////////////////////////////////////////////
@@ -442,7 +467,7 @@ def perform_timestep(
         state[ll,nz+hs  ,i] = state[ll,nz+hs-1,i]
         state[ll,nz+hs+1,i] = state[ll,nz+hs-1,i]
 
-  # TODO (mfh 2025/03/03) Return something, not sure what yet
+  # NOTE (mfh 2025/03/03) Don't need to return anything, as long as state can be an output parameter
 
 
 # state, dt, and fixed_data used to be output parameters.
@@ -732,89 +757,97 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t)
   return (r, t)
 
 
-//Sample from an ellipse of a specified center, radius, and amplitude at a specified location
-//x and z are input coordinates
-//amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse
-real sample_ellipse_cosine( real x , real z , real amp , real x0 , real z0 , real xrad , real zrad ) {
-  //Compute distance from bubble center
-  real dist = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.;
-  //If the distance from bubble center is less than the radius, create a cos**2 profile
-  if (dist <= pi / 2.) {
-    return amp * pow(cos(dist),2.);
-  } else {
-    return 0.;
-  }
-}
-
-
-//Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime)
-//The file I/O uses parallel-netcdf, the only external library required for this mini-app.
-//If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
-void output( realConst3d state , real etime , int &num_out , Fixed_data const &fixed_data ) {
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &i_beg              = fixed_data.i_beg             ;
-  auto &k_beg              = fixed_data.k_beg             ;
-  auto &mainproc         = fixed_data.mainproc        ;
-  auto &hy_dens_cell       = fixed_data.hy_dens_cell      ;
-  auto &hy_dens_theta_cell = fixed_data.hy_dens_theta_cell;
+# Sample from an ellipse of a specified center, radius, and amplitude at a specified location
+# x and z are input coordinates
+# amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse
+def sample_ellipse_cosine(x: real, z: real, amp: real, x0: real, z0: real, xrad: real, zrad: real) -> real:
+  # Compute distance from bubble center
+  dist: real = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0
+  # If the distance from bubble center is less than the radius, create a cos**2 profile
+  if dist <= pi / 2.0:
+    return amp * pow(cos(dist),2.)
+  else:
+    return 0.0
+
+
+# Output the fluid state (state) to a NetCDF file at a given elapsed model time (etime)
+# The file I/O uses parallel-netcdf, the only external library required for this mini-app.
+# If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
+def output(
+    state, # realConst3d
+    etime, # real
+    num_out, # int
+    fixed_data # Fixed_data const&
+  ) -> int: # num_out (updated)
+
+  nx                 = fixed_data.nx
+  nz                 = fixed_data.nz
+  i_beg              = fixed_data.i_beg
+  k_beg              = fixed_data.k_beg
+  mainproc           = fixed_data.mainproc
+  hy_dens_cell       = fixed_data.hy_dens_cell
+  hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
 
   int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
   MPI_Offset st1[1], ct1[1], st3[3], ct3[3];
-  //Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta)
-  //Inform the user
-  if (mainproc) { printf("*** OUTPUT ***\n"); }
+  # Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta)
+  # Inform the user
+  if mainproc:
+    print("*** OUTPUT ***\n")
   # Allocate some (big) temp arrays
-  dens = doub2d("dens", nz, nx);
-  uwnd = doub2d("uwnd", nz, nx);
-  wwnd = doub2d("wwnd", nz, nx);
-  theta = doub2d("theta", nz, nx);
-
-  //If the elapsed time is zero, create the file. Otherwise, open the file
-  if (etime == 0) {
-    //Create the file
+  dens = doub2d("dens", nz, nx)
+  uwnd = doub2d("uwnd", nz, nx)
+  wwnd = doub2d("wwnd", nz, nx)
+  theta = doub2d("theta", nz, nx)
+
+  # If the elapsed time is zero, create the file. Otherwise, open the file
+  if etime == 0:
+    # Create the file
     ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ );
-    //Create the dimensions
-    ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ );
-    ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob      , &x_dimid ) , __LINE__ );
-    ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob      , &z_dimid ) , __LINE__ );
-    //Create the variables
-    dimids[0] = t_dimid;
-    ncwrap( ncmpi_def_var( ncid , "t"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ );
-    dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid;
-    ncwrap( ncmpi_def_var( ncid , "dens"  , NC_DOUBLE , 3 , dimids ,  &dens_varid ) , __LINE__ );
-    ncwrap( ncmpi_def_var( ncid , "uwnd"  , NC_DOUBLE , 3 , dimids ,  &uwnd_varid ) , __LINE__ );
-    ncwrap( ncmpi_def_var( ncid , "wwnd"  , NC_DOUBLE , 3 , dimids ,  &wwnd_varid ) , __LINE__ );
-    ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ );
-    //End "define" mode
+    # Create the dimensions
+    ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ )
+    ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob      , &x_dimid ) , __LINE__ )
+    ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob      , &z_dimid ) , __LINE__ )
+    # Create the variables
+    dimids[0] = t_dimid
+    ncwrap( ncmpi_def_var( ncid , "t"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ )
+    dimids[0] = t_dimid
+    dimids[1] = z_dimid
+    dimids[2] = x_dimid
+    ncwrap( ncmpi_def_var( ncid , "dens"  , NC_DOUBLE , 3 , dimids ,  &dens_varid ) , __LINE__ )
+    ncwrap( ncmpi_def_var( ncid , "uwnd"  , NC_DOUBLE , 3 , dimids ,  &uwnd_varid ) , __LINE__ )
+    ncwrap( ncmpi_def_var( ncid , "wwnd"  , NC_DOUBLE , 3 , dimids ,  &wwnd_varid ) , __LINE__ )
+    ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ )
+    # End "define" mode
     ncwrap( ncmpi_enddef( ncid ) , __LINE__ );
-  } else {
-    //Open the file
-    ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ );
-    //Get the variable IDs
-    ncwrap( ncmpi_inq_varid( ncid , "dens"  ,  &dens_varid ) , __LINE__ );
-    ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ );
-    ncwrap( ncmpi_inq_varid( ncid , "wwnd"  ,  &wwnd_varid ) , __LINE__ );
-    ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ );
-    ncwrap( ncmpi_inq_varid( ncid , "t"     ,     &t_varid ) , __LINE__ );
-  }
-
-  //Store perturbed values in the temp arrays for output
-  /////////////////////////////////////////////////
-  // TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
-  /////////////////////////////////////////////////
-  for (int k=0; k<nz; k++) {
-    for (int i=0; i<nx; i++) {
-      dens (k,i) = state(ID_DENS,hs+k,hs+i);
-      uwnd (k,i) = state(ID_UMOM,hs+k,hs+i) / ( hy_dens_cell(hs+k) + state(ID_DENS,hs+k,hs+i) );
-      wwnd (k,i) = state(ID_WMOM,hs+k,hs+i) / ( hy_dens_cell(hs+k) + state(ID_DENS,hs+k,hs+i) );
-      theta(k,i) = ( state(ID_RHOT,hs+k,hs+i) + hy_dens_theta_cell(hs+k) ) / ( hy_dens_cell(hs+k) + state(ID_DENS,hs+k,hs+i) ) - hy_dens_theta_cell(hs+k) / hy_dens_cell(hs+k);
-    }
-  }
-
-  //Write the grid data to file with all the processes writing collectively
-  st3[0] = num_out; st3[1] = k_beg; st3[2] = i_beg;
-  ct3[0] = 1      ; ct3[1] = nz   ; ct3[2] = nx   ;
+  else:
+    # Open the file
+    ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ )
+    # Get the variable IDs
+    ncwrap( ncmpi_inq_varid( ncid , "dens"  ,  &dens_varid ) , __LINE__ )
+    ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ )
+    ncwrap( ncmpi_inq_varid( ncid , "wwnd"  ,  &wwnd_varid ) , __LINE__ )
+    ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ )
+    ncwrap( ncmpi_inq_varid( ncid , "t"     ,     &t_varid ) , __LINE__ )
+
+  # Store perturbed values in the temp arrays for output
+  # /////////////////////////////////////////////////
+  # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
+  # /////////////////////////////////////////////////
+  for k in range(nz):
+    for i in range(nx):
+      dens[k,i]  = state[ID_DENS,hs+k,hs+i]
+      uwnd[k,i]  = state[ID_UMOM,hs+k,hs+i] / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] )
+      wwnd[k,i]  = state[ID_WMOM,hs+k,hs+i] / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] )
+      theta[k,i] = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] ) - hy_dens_theta_cell[hs+k] / hy_dens_cell[hs+k]
+
+  # Write the grid data to file with all the processes writing collectively
+  st3[0] = num_out
+  st3[1] = k_beg
+  st3[2] = i_beg
+  ct3[0] = 1
+  ct3[1] = nz
+  ct3[2] = nx
   ncwrap( ncmpi_put_vara_double_all( ncid ,  dens_varid , st3 , ct3 , dens.data()  ) , __LINE__ );
   ncwrap( ncmpi_put_vara_double_all( ncid ,  uwnd_varid , st3 , ct3 , uwnd.data()  ) , __LINE__ );
   ncwrap( ncmpi_put_vara_double_all( ncid ,  wwnd_varid , st3 , ct3 , wwnd.data()  ) , __LINE__ );
@@ -823,22 +856,23 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t)
   //Only the main process needs to write the elapsed time
   //Begin "independent" write mode
   ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ );
-  //write elapsed time to file
-  if (mainproc) {
-    st1[0] = num_out;
-    ct1[0] = 1;
+  # write elapsed time to file
+  if mainproc:
+    st1[0] = num_out
+    ct1[0] = 1
     double etimearr[1];
     etimearr[0] = etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ );
-  }
+
   //End "independent" write mode
   ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ );
 
   //Close the file
   ncwrap( ncmpi_close(ncid) , __LINE__ );
 
-  //Increment the number of outputs
+  # Increment the number of outputs
   num_out = num_out + 1;
-}
+
+  return num_out
 
 
 //Error reporting routine for the PNetCDF I/O
@@ -867,8 +901,8 @@ def reductions(
   hy_dens_cell       = fixed_data.hy_dens_cell      
   hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
 
-  mass = 0
-  te   = 0
+  mass: double = 0.0
+  te: double   = 0.0
   for k in range(nz):
     for i in range(nx):
       r = state[ID_DENS,hs+k,hs+i] + hy_dens_cell[hs+k]                # Density

From bd6a3de19bd45d28ce5cb3c75f249d1cbc30d882 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 11:18:44 -0700
Subject: [PATCH 04/37] Progress on porting to Python

---
 python/miniWeather.py | 72 ++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 39 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index d601022..fcbf75f 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -7,15 +7,12 @@
 # //
 # //////////////////////////////////////////////////////////////////////////////////////////
 
-#include <stdlib.h>
-#include <stdio.h>
-#include <mpi.h>
-#include <ctime>
+import timeit
 #include "const.h"
 #include "pnetcdf.h"
-#include <chrono>
 
-# "Halo" size: number of cells beyond the MPI tasks's domain needed for a full "stencil" of information for reconstruction
+# "Halo" size: number of cells beyond the MPI tasks's domain
+# needed for a full "stencil" of information for reconstruction
 hs: int = 2
 
 # real can be either float or double, depending on cpp/const.h.
@@ -67,61 +64,58 @@ def main() -> None:
   #MPI_Init(&argc,&argv);
   #yakl::init();
 
-  #Fixed_data fixed_data;
-  #real3d state;
-  #real dt;                    //Model time step (seconds)
-
-  # init allocates state
+  # fixed_data: Fixed_data
+  # state: real3d
+  # dt: real: Model time step (seconds)
   (fixed_data, state, dt) = init() # init( state , dt , fixed_data );
 
-  mainproc = fixed_data.mainproc;
+  mainproc = fixed_data.mainproc
 
   # Initial reductions for mass, kinetic energy, and total energy
   (mass0, te0) = reductions(state, fixed_data)
 
-  num_out: int         = 0  # The number of outputs performed so far
-  output_counter: real = 0  # Helps determine when it's time to do output
-  etime: real          = 0
+  num_out: int = 0    # The number of outputs performed so far
+  etime: real  = 0.0  # Elapsed time
 
   # Output the initial state
   if output_freq >= 0:
     num_out = output(state, etime, num_out, fixed_data)
 
-  direction_switch: int = 1 # Tells dimensionally split which order to take x,z solves
-
   # ////////////////////////////////////////////////////
   # MAIN TIME STEP LOOP
   # ////////////////////////////////////////////////////
-  t1 = std::chrono::steady_clock::now()
-  
-  while etime < sim_time:
-    # If the time step leads to exceeding the simulation time, shorten it for the last step
-    if etime + dt > sim_time:
-      dt = sim_time - etime
-
-    # Perform a single time step
-    direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
-    # Inform the user
-    if mainproc:
-      print( "Elapsed Time: %lf / %lf\n", etime , sim_time )
-    # Update the elapsed time and output counter
-    etime = etime + dt
-    output_counter = output_counter + dt
-    # If it's time for output, reset the counter, and do output
-    if output_freq >= 0 and output_counter >= output_freq:
-      output_counter = output_counter - output_freq
+  def run_simulation():
+    direction_switch: int = 1  # Order in which dimensional splitting takes x,z solves
+    output_counter: real = 0.0 # Helps determine when it's time to do output
+
+    while etime < sim_time:
+      # If the time step leads to exceeding the simulation time, shorten it for the last step
+      if etime + dt > sim_time:
+        dt = sim_time - etime
+
+      # Perform a single time step
+      direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
+      # Inform the user
+      if mainproc:
+        print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}\n")
+      # Update the elapsed time and output counter
+      etime = etime + dt
+      output_counter = output_counter + dt
+      # If it's time for output, reset the counter, and do output
+      if output_freq >= 0 and output_counter >= output_freq:
+        output_counter = output_counter - output_freq
       num_out = output(state, etime, num_out, fixed_data)
 
-  auto t2 = std::chrono::steady_clock::now();
+  time_in_s = timeit.timeit(run_simulation(), number=1)
   if mainproc:
-    print( "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n" )
+    print(f"CPU Time: {time_in_s} s\n")
 
   # Final reductions for mass, kinetic energy, and total energy
   (mass, te) = reductions(state, fixed_data)
 
   if mainproc:
-    print( "d_mass: %le\n" % ((mass - mass0)/mass0) )
-    print( "d_te:   %le\n" % ((te   - te0  )/te0  ) )
+    print( f"d_mass: {((mass - mass0)/mass0)}" )
+    print( f"d_te:   {((te   - te0  )/te0  )}" )
 
   finalize()
 

From d5634d23143e0b143212206efb13c623437d455e Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 11:24:35 -0700
Subject: [PATCH 05/37] Progress on porting to Python (print statements)

---
 python/miniWeather.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index fcbf75f..ba3cc30 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -7,6 +7,7 @@
 # //
 # //////////////////////////////////////////////////////////////////////////////////////////
 
+import sys
 import timeit
 #include "const.h"
 #include "pnetcdf.h"
@@ -114,8 +115,8 @@ def run_simulation():
   (mass, te) = reductions(state, fixed_data)
 
   if mainproc:
-    print( f"d_mass: {((mass - mass0)/mass0)}" )
-    print( f"d_te:   {((te   - te0  )/te0  )}" )
+    print(f"d_mass: {((mass - mass0)/mass0)}")
+    print(f"d_te:   {((te   - te0  )/te0  )}")
 
   finalize()
 
@@ -513,9 +514,9 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
 
   # If I'm the main process in MPI, display some grid information
   if mainproc:
-    print( "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob)
-    print( "dx,dz: %lf %lf\n",dx,dz)
-    print( "dt: %lf\n",dt)
+    print(f"nx_glob, nz_glob: {nx_glob} {nz_glob}\n")
+    print(f"dx,dz: {dx} {dz}\n")
+    print(f"dt: {dt}\n")
 
   # Want to make sure this info is displayed before further output
   # ierr = MPI_Barrier(MPI_COMM_WORLD);
@@ -872,9 +873,9 @@ def output(
 //Error reporting routine for the PNetCDF I/O
 void ncwrap( int ierr , int line ) {
   if (ierr != NC_NOERR) {
-    printf("NetCDF Error at line: %d\n", line);
-    printf("%s\n",ncmpi_strerror(ierr));
-    exit(-1);
+    print(f"NetCDF Error at line: {line}\n")
+    print(f"{ncmpi_strerror(ierr)}\n")
+    sys.exit(-1)
   }
 }
 

From 3fe55840f3f7f4b40a56c27320d9acba744ea0cc Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 14:51:55 -0700
Subject: [PATCH 06/37] All C++ code is now Python code

---
 python/miniWeather.py | 297 +++++++++++++++++++-----------------------
 1 file changed, 132 insertions(+), 165 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index ba3cc30..21ee2c3 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -9,54 +9,129 @@
 
 import sys
 import timeit
-#include "const.h"
+import numpy as np
 #include "pnetcdf.h"
 
+# "real" in the original C++ code could be either float or double.
+real = np.float64 # or np.float32
+
+#
+# Constants (ported from cpp/const.h)
+#
+
+hs: int = 2
+
+# We don't like code that redefines pi,
+# but we keep it just for now, to test that
+# the Python gives the same results as the C++.
+pi: real   = 3.14159265358979323846264338327
+grav: real = 9.8     # Gravitational acceleration (m / s^2)
+cp: real   = 1004.0  # Specific heat of dry air at constant pressure
+cv: real   = 717.0   # Specific heat of dry air at constant volume
+rd: real   = 287.0   # Dry air constant for equation of state (P=rho*rd*T)
+p0: real   = 1.e5    # Standard pressure at the surface in Pascals
+C0: real   = 27.5629410929725921310572974482 # Constant to translate potential temperature into pressure (P=C0*(rho*theta)**gamma)
+# gamma=cp/Rd, have to call this gamm because "gamma" is taken (I hate C so much)
+gamm: real = 1.40027894002789400278940027894
+
+#
+# Domain and stability-related constants
+#
+
+xlen: real = 2.e4    # Length of the domain in the x-direction (meters)
+zlen: real = 1.e4    # Length of the domain in the z-direction (meters)
+hv_beta: real = 0.05 # How strong to diffuse the solution: hv_beta \in [0:1]
+cfl: real = 1.50     # "Courant, Friedrichs, Lewy" number (for numerical stability)
+max_speed: real = 450 # Assumed maximum wave speed during the simulation (speed of sound + speed of wind) (meter / sec)
 # "Halo" size: number of cells beyond the MPI tasks's domain
 # needed for a full "stencil" of information for reconstruction
 hs: int = 2
+# Size of the stencil used for interpolation
+sten_size: int = 4
 
-# real can be either float or double, depending on cpp/const.h.
+# ///////////////////////////////////////////////////////////////////////////////////////
+# // BEGIN USER-CONFIGURABLE PARAMETERS
+# ///////////////////////////////////////////////////////////////////////////////////////
+# The x-direction length is twice as long as the z-direction length
+# So, you'll want to have nx_glob be twice as large as nz_glob
+nx_glob: int = _NX              # Number of total cells in the x-direction
+nz_glob: int = _NZ              # Number of total cells in the z-direction
+sim_time: real = _SIM_TIME      # How many seconds to run the simulation
+output_freq: real = _OUT_FREQ   # How frequently to output data to file (in seconds)
+data_spec_int: int = _DATA_SPEC # How to initialize the data
+# ///////////////////////////////////////////////////////////////////////////////////////
+# // END USER-CONFIGURABLE PARAMETERS
+# ///////////////////////////////////////////////////////////////////////////////////////
+dx: real = xlen / nx_glob
+dz: real = zlen / nz_glob
+
+
+#
+# Parameters for indexing and flags
+#
+
+NUM_VARS: int = 4           # Number of fluid state variables
+ID_DENS: int = 0           #index for density ("rho")
+ID_UMOM: int = 1           #index for momentum in the x-direction ("rho * u")
+ID_WMOM: int = 2           #index for momentum in the z-direction ("rho * w")
+ID_RHOT: int = 3           #index for density * potential temperature ("rho * theta")
+DIR_X: int = 1              #Integer constant to express that this operation is in the x-direction
+DIR_Z: int = 2              #Integer constant to express that this operation is in the z-direction
+DATA_SPEC_COLLISION: int = 1
+DATA_SPEC_THERMAL: int = 2
+DATA_SPEC_GRAVITY_WAVES: int = 3
+DATA_SPEC_DENSITY_CURRENT: int = 5
+DATA_SPEC_INJECTION: int = 6
+
+#
+# These functions aid in porting from the original C++.
+#
 
 #typedef yakl::Array<real  ,1,yakl::memHost> real1d;
 def real1d(name: string, nx: int):
-    # FIXME (mfh 2025/03/03) This should NOT return the same thing as doub1d; need to change type
-    return np.zeros(nx) # FIXME (mfh 2025/03/03)
+  return np.zeros(nx, dtype=real)
 
 #typedef yakl::Array<real  ,2,yakl::memHost> real2d;
 def real2d(name: string, nx: int, nz: int):
-    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+  return np.zeros((nz, nx), dtype=real)
 
 #typedef yakl::Array<real  ,3,yakl::memHost> real3d;
 def real3d(name: string, nx: int, nz: int, nvars: int):
-    return np.zeros((nvars, nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
+  return np.zeros((nvars, nz, nx), dtype=real)
 
 #typedef yakl::Array<double,2,yakl::memHost> doub2d;
 def doub2d(name: string, nx: int, nz: int):
-    return np.zeros((nz, nx)) # FIXME (mfh 2025/03/03) What element type should this return?
-
-typedef yakl::Array<real   const,1,yakl::memHost> realConst1d;
-typedef yakl::Array<real   const,2,yakl::memHost> realConst2d;
-typedef yakl::Array<real   const,3,yakl::memHost> realConst3d;
-typedef yakl::Array<double const,1,yakl::memHost> doubConst1d;
-typedef yakl::Array<double const,2,yakl::memHost> doubConst2d;
-typedef yakl::Array<double const,3,yakl::memHost> doubConst3d;
-
-///////////////////////////////////////////////////////////////////////////////////////
-// Variables that are initialized but remain static over the course of the simulation
-///////////////////////////////////////////////////////////////////////////////////////
-struct Fixed_data {
-  int nx, nz;                 //Number of local grid cells in the x- and z- dimensions for this MPI task
-  int i_beg, k_beg;           //beginning index in the x- and z-directions for this MPI task
-  int nranks, myrank;         //Number of MPI ranks and my rank id
-  int left_rank, right_rank;  //MPI Rank IDs that exist to my left and right in the global domain
-  int mainproc;             //Am I the main process (rank == 0)?
-  realConst1d hy_dens_cell;        //hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
-  realConst1d hy_dens_theta_cell;  //hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
-  realConst1d hy_dens_int;         //hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
-  realConst1d hy_dens_theta_int;   //hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
-  realConst1d hy_pressure_int;     //hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
-};
+  return np.zeros((nz, nx), dtype=real)
+
+#typedef yakl::Array<real   const,1,yakl::memHost> realConst1d;
+def realConst1d(name: string, nx: int):
+  return np.zeros(nx, dtype=real)
+
+#typedef yakl::Array<real   const,2,yakl::memHost> realConst2d;
+def realConst2d(name: string, nx: int, nz: int):
+  return np.zeros((nz, nx), dtype=real)
+
+# ///////////////////////////////////////////////////////////////////////////////////////
+# // Variables that are initialized but remain static over the course of the simulation
+# ///////////////////////////////////////////////////////////////////////////////////////
+
+class Fixed_data:
+  def __init__(self):
+    self.nx = 0          # Number of local grid cells in the x dimension for this MPI task
+    self.nz = 0          # Number of local grid cells in the z dimension for this MPI task
+    self.i_beg = 0       # beginning index in the x direction for this MPI task
+    self.k_beg = 0       # beginning index in the z direction for this MPI task
+    self.nranks = 0      # Number of MPI ranks
+    self.myrank = 0      # My rank id
+    self.left_rank = 0   # MPI Rank ID that exists to my left in the global domain
+    self.right_rank = 0  # MPI Rank ID that exists to my right in the global domain
+    self.mainproc = True # Am I the main process (rank == 0)?
+    self.hy_dens_cell = None       # realConst1d: hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+    self.hy_dens_theta_cell = None # realConst1d: hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+    self.hy_dens_int = None        # realConst1d: hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+    self.hy_dens_theta_int = None  # realConst1d: hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+    self.hy_pressure_int = None    # realConst1d: hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+
 
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // THE MAIN PROGRAM STARTS HERE
@@ -68,7 +143,7 @@ def main() -> None:
   # fixed_data: Fixed_data
   # state: real3d
   # dt: real: Model time step (seconds)
-  (fixed_data, state, dt) = init() # init( state , dt , fixed_data );
+  (fixed_data, state, dt) = init()
 
   mainproc = fixed_data.mainproc
 
@@ -468,19 +543,7 @@ def set_halo_values_z(
 # state, dt, and fixed_data used to be output parameters.
 # It would be more Pythonic to return them as a tuple.
 def init(): # -> tuple[real3d, real, Fixed_data]:
-  #real3d state
-  #Fixed_data fixed_data
-  #real dt
-
-  auto &nx                 = fixed_data.nx                ;
-  auto &nz                 = fixed_data.nz                ;
-  auto &i_beg              = fixed_data.i_beg             ;
-  auto &k_beg              = fixed_data.k_beg             ;
-  auto &left_rank          = fixed_data.left_rank         ;
-  auto &right_rank         = fixed_data.right_rank        ;
-  auto &nranks             = fixed_data.nranks            ;
-  auto &myrank             = fixed_data.myrank            ;
-  auto &mainproc         = fixed_data.mainproc        ;
+
   ierr: int = 0
 
   # /////////////////////////////////////////////////////////////
@@ -501,7 +564,8 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
   # END MPI DUMMY SECTION
   # //////////////////////////////////////////////
 
-  # Vertical direction isn't MPI-ized, so the rank's local values = the global values
+  # Vertical direction isn't MPI-ized,
+  # so the rank's local values = the global values
   k_beg = 0
   nz = nz_glob
   mainproc = (myrank == 0)
@@ -522,10 +586,7 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
   # ierr = MPI_Barrier(MPI_COMM_WORLD);
 
   # Define quadrature weights and points
-  nqpoints: int = 3 
-  # SArray<real,1,nqpoints> qpoints;
-  # SArray<real,1,nqpoints> qweights;
-
+  nqpoints: int = 3
   qpoints = np.array([
       0.112701665379258311482073460022,
       0.500000000000000000000000000000,
@@ -555,11 +616,6 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
           # Compute the x,z location within the global domain based on cell and quadrature index
           x: real = (i_beg + i-hs+0.5)*dx + (qpoints[ii]-0.5)*dx
           z: real = (k_beg + k-hs+0.5)*dz + (qpoints[kk]-0.5)*dz
-          # real r, u, w, t, hr, ht;
-
-          # The above real variables are probably output parameters
-          # of collision, thermal, gravity_waves, density_current, and injection.
-          # x and z are probably input parameters of these functions.
 
           # Set the fluid state based on the user's specification
           if data_spec_int == DATA_SPEC_COLLISION:
@@ -574,16 +630,16 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
             (r,u,w,t,hr,ht) = injection(x,z)
 
           # Store into the fluid state array
-          state[ID_DENS,k,i] += r                         * qweights[ii]*qweights[kk];
-          state[ID_UMOM,k,i] += (r+hr)*u                  * qweights[ii]*qweights[kk];
-          state[ID_WMOM,k,i] += (r+hr)*w                  * qweights[ii]*qweights[kk];
-          state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk];
+          state[ID_DENS,k,i] += r                         * qweights[ii]*qweights[kk]
+          state[ID_UMOM,k,i] += (r+hr)*u                  * qweights[ii]*qweights[kk]
+          state[ID_WMOM,k,i] += (r+hr)*w                  * qweights[ii]*qweights[kk]
+          state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]
 
-  hy_dens_cell       = real1d("hy_dens_cell      ", nz+2*hs);
-  hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs);
-  hy_dens_int        = real1d("hy_dens_int       ", nz+1);
-  hy_dens_theta_int  = real1d("hy_dens_theta_int ", nz+1);
-  hy_pressure_int    = real1d("hy_pressure_int   ", nz+1);
+  hy_dens_cell       = real1d("hy_dens_cell      ", nz+2*hs)
+  hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs)
+  hy_dens_int        = real1d("hy_dens_int       ", nz+1)
+  hy_dens_theta_int  = real1d("hy_dens_theta_int ", nz+1)
+  hy_pressure_int    = real1d("hy_pressure_int   ", nz+1)
 
   # Compute the hydrostatic background state over vertical cell averages
   # /////////////////////////////////////////////////
@@ -630,15 +686,20 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
     
     hy_dens_int[k]       = hr
     hy_dens_theta_int[k] = hr*ht
-    hy_pressure_int[k]   = C0*pow((hr*ht),gamm)
+    hy_pressure_int[k]   = C0*pow((hr*ht), gamm)
+
+  hy_dens_cell       = realConst1d(hy_dens_cell      )
+  hy_dens_theta_cell = realConst1d(hy_dens_theta_cell)
+  hy_dens_int        = realConst1d(hy_dens_int       )
+  hy_dens_theta_int  = realConst1d(hy_dens_theta_int )
+  hy_pressure_int    = realConst1d(hy_pressure_int   )
 
-  fixed_data.hy_dens_cell       = realConst1d(hy_dens_cell      )
-  fixed_data.hy_dens_theta_cell = realConst1d(hy_dens_theta_cell)
-  fixed_data.hy_dens_int        = realConst1d(hy_dens_int       )
-  fixed_data.hy_dens_theta_int  = realConst1d(hy_dens_theta_int )
-  fixed_data.hy_pressure_int    = realConst1d(hy_pressure_int   )
+  fixed_data = Fixed_data(nx, nz, i_beg, k_beg,
+    nranks, myrank, left_rank, right_rank, mainproc,
+    hy_dens_cell, hy_dens_theta_cell, hy_dens_int,
+    hy_dens_theta_int, hy_pressure_int)
 
-  # FIXME (mfh 2025/03/03) This should return something; not sure what yet
+  return (state, fixed_data, dt)
 
 # This test case is initially balanced but injects fast, cold air from the left boundary near the model top
 # x and z are input coordinates at which to sample
@@ -775,94 +836,10 @@ def output(
     fixed_data # Fixed_data const&
   ) -> int: # num_out (updated)
 
-  nx                 = fixed_data.nx
-  nz                 = fixed_data.nz
-  i_beg              = fixed_data.i_beg
-  k_beg              = fixed_data.k_beg
-  mainproc           = fixed_data.mainproc
-  hy_dens_cell       = fixed_data.hy_dens_cell
-  hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
-
-  int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
-  MPI_Offset st1[1], ct1[1], st3[3], ct3[3];
-  # Temporary arrays to hold density, u-wind, w-wind, and potential temperature (theta)
-  # Inform the user
   if mainproc:
     print("*** OUTPUT ***\n")
-  # Allocate some (big) temp arrays
-  dens = doub2d("dens", nz, nx)
-  uwnd = doub2d("uwnd", nz, nx)
-  wwnd = doub2d("wwnd", nz, nx)
-  theta = doub2d("theta", nz, nx)
-
-  # If the elapsed time is zero, create the file. Otherwise, open the file
-  if etime == 0:
-    # Create the file
-    ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ );
-    # Create the dimensions
-    ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ )
-    ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob      , &x_dimid ) , __LINE__ )
-    ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob      , &z_dimid ) , __LINE__ )
-    # Create the variables
-    dimids[0] = t_dimid
-    ncwrap( ncmpi_def_var( ncid , "t"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ )
-    dimids[0] = t_dimid
-    dimids[1] = z_dimid
-    dimids[2] = x_dimid
-    ncwrap( ncmpi_def_var( ncid , "dens"  , NC_DOUBLE , 3 , dimids ,  &dens_varid ) , __LINE__ )
-    ncwrap( ncmpi_def_var( ncid , "uwnd"  , NC_DOUBLE , 3 , dimids ,  &uwnd_varid ) , __LINE__ )
-    ncwrap( ncmpi_def_var( ncid , "wwnd"  , NC_DOUBLE , 3 , dimids ,  &wwnd_varid ) , __LINE__ )
-    ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ )
-    # End "define" mode
-    ncwrap( ncmpi_enddef( ncid ) , __LINE__ );
-  else:
-    # Open the file
-    ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ )
-    # Get the variable IDs
-    ncwrap( ncmpi_inq_varid( ncid , "dens"  ,  &dens_varid ) , __LINE__ )
-    ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ )
-    ncwrap( ncmpi_inq_varid( ncid , "wwnd"  ,  &wwnd_varid ) , __LINE__ )
-    ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ )
-    ncwrap( ncmpi_inq_varid( ncid , "t"     ,     &t_varid ) , __LINE__ )
-
-  # Store perturbed values in the temp arrays for output
-  # /////////////////////////////////////////////////
-  # TODO: MAKE THESE 2 LOOPS A PARALLEL_FOR
-  # /////////////////////////////////////////////////
-  for k in range(nz):
-    for i in range(nx):
-      dens[k,i]  = state[ID_DENS,hs+k,hs+i]
-      uwnd[k,i]  = state[ID_UMOM,hs+k,hs+i] / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] )
-      wwnd[k,i]  = state[ID_WMOM,hs+k,hs+i] / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] )
-      theta[k,i] = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / ( hy_dens_cell[hs+k] + state[ID_DENS,hs+k,hs+i] ) - hy_dens_theta_cell[hs+k] / hy_dens_cell[hs+k]
-
-  # Write the grid data to file with all the processes writing collectively
-  st3[0] = num_out
-  st3[1] = k_beg
-  st3[2] = i_beg
-  ct3[0] = 1
-  ct3[1] = nz
-  ct3[2] = nx
-  ncwrap( ncmpi_put_vara_double_all( ncid ,  dens_varid , st3 , ct3 , dens.data()  ) , __LINE__ );
-  ncwrap( ncmpi_put_vara_double_all( ncid ,  uwnd_varid , st3 , ct3 , uwnd.data()  ) , __LINE__ );
-  ncwrap( ncmpi_put_vara_double_all( ncid ,  wwnd_varid , st3 , ct3 , wwnd.data()  ) , __LINE__ );
-  ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta.data() ) , __LINE__ );
-
-  //Only the main process needs to write the elapsed time
-  //Begin "independent" write mode
-  ncwrap( ncmpi_begin_indep_data(ncid) , __LINE__ );
-  # write elapsed time to file
-  if mainproc:
-    st1[0] = num_out
-    ct1[0] = 1
-    double etimearr[1];
-    etimearr[0] = etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ );
-
-  //End "independent" write mode
-  ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ );
 
-  //Close the file
-  ncwrap( ncmpi_close(ncid) , __LINE__ );
+  # TODO (mfh 2025/03/04) Actually write to the output file.
 
   # Increment the number of outputs
   num_out = num_out + 1;
@@ -870,16 +847,6 @@ def output(
   return num_out
 
 
-//Error reporting routine for the PNetCDF I/O
-void ncwrap( int ierr , int line ) {
-  if (ierr != NC_NOERR) {
-    print(f"NetCDF Error at line: {line}\n")
-    print(f"{ncmpi_strerror(ierr)}\n")
-    sys.exit(-1)
-  }
-}
-
-
 def finalize() -> None:
   pass
 

From 2967153482b9210df835e1ddeb3870b1c9886ef7 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 16:22:26 -0700
Subject: [PATCH 07/37] Run-time debugging of port

---
 python/miniWeather.py | 131 +++++++++++++++++++++---------------------
 1 file changed, 66 insertions(+), 65 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 21ee2c3..14710c9 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -7,23 +7,41 @@
 # //
 # //////////////////////////////////////////////////////////////////////////////////////////
 
+import math
+import numpy as np
 import sys
 import timeit
-import numpy as np
 #include "pnetcdf.h"
 
 # "real" in the original C++ code could be either float or double.
 real = np.float64 # or np.float32
+double = np.float64
 
 #
-# Constants (ported from cpp/const.h)
+# Parameters for indexing and flags
+# (effectively enums, but we leave them as constants
+# so it's easier to see the relationship to the original C++)
 #
 
-hs: int = 2
+NUM_VARS: int = 4           # Number of fluid state variables
+ID_DENS: int = 0           #index for density ("rho")
+ID_UMOM: int = 1           #index for momentum in the x-direction ("rho * u")
+ID_WMOM: int = 2           #index for momentum in the z-direction ("rho * w")
+ID_RHOT: int = 3           #index for density * potential temperature ("rho * theta")
+DIR_X: int = 1              #Integer constant to express that this operation is in the x-direction
+DIR_Z: int = 2              #Integer constant to express that this operation is in the z-direction
+DATA_SPEC_COLLISION: int = 1
+DATA_SPEC_THERMAL: int = 2
+DATA_SPEC_GRAVITY_WAVES: int = 3
+DATA_SPEC_DENSITY_CURRENT: int = 5
+DATA_SPEC_INJECTION: int = 6
+
+#
+# Constants (ported from cpp/const.h)
+#
 
-# We don't like code that redefines pi,
-# but we keep it just for now, to test that
-# the Python gives the same results as the C++.
+# We don't like code that redefines pi, but we keep it for now,
+# to test that the Python gives the same results as the C++.
 pi: real   = 3.14159265358979323846264338327
 grav: real = 9.8     # Gravitational acceleration (m / s^2)
 cp: real   = 1004.0  # Specific heat of dry air at constant pressure
@@ -54,61 +72,41 @@
 # ///////////////////////////////////////////////////////////////////////////////////////
 # The x-direction length is twice as long as the z-direction length
 # So, you'll want to have nx_glob be twice as large as nz_glob
-nx_glob: int = _NX              # Number of total cells in the x-direction
-nz_glob: int = _NZ              # Number of total cells in the z-direction
-sim_time: real = _SIM_TIME      # How many seconds to run the simulation
-output_freq: real = _OUT_FREQ   # How frequently to output data to file (in seconds)
-data_spec_int: int = _DATA_SPEC # How to initialize the data
+nz_glob: int = 50               # Number of total cells in the z-direction
+nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
+sim_time: real = 1000.0         # How many seconds to run the simulation
+output_freq: real = 10.0        # How frequently to output data to file (in seconds)
+data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // END USER-CONFIGURABLE PARAMETERS
 # ///////////////////////////////////////////////////////////////////////////////////////
 dx: real = xlen / nx_glob
 dz: real = zlen / nz_glob
 
-
-#
-# Parameters for indexing and flags
-#
-
-NUM_VARS: int = 4           # Number of fluid state variables
-ID_DENS: int = 0           #index for density ("rho")
-ID_UMOM: int = 1           #index for momentum in the x-direction ("rho * u")
-ID_WMOM: int = 2           #index for momentum in the z-direction ("rho * w")
-ID_RHOT: int = 3           #index for density * potential temperature ("rho * theta")
-DIR_X: int = 1              #Integer constant to express that this operation is in the x-direction
-DIR_Z: int = 2              #Integer constant to express that this operation is in the z-direction
-DATA_SPEC_COLLISION: int = 1
-DATA_SPEC_THERMAL: int = 2
-DATA_SPEC_GRAVITY_WAVES: int = 3
-DATA_SPEC_DENSITY_CURRENT: int = 5
-DATA_SPEC_INJECTION: int = 6
-
 #
 # These functions aid in porting from the original C++.
+# Not sure why the degrees of freedom are in reverse order;
+# perhaps the original author wanted to preserve Fortran order.
 #
 
 #typedef yakl::Array<real  ,1,yakl::memHost> real1d;
-def real1d(name: string, nx: int):
+def real1d(name: str, nx: int):
   return np.zeros(nx, dtype=real)
 
-#typedef yakl::Array<real  ,2,yakl::memHost> real2d;
-def real2d(name: string, nx: int, nz: int):
-  return np.zeros((nz, nx), dtype=real)
-
 #typedef yakl::Array<real  ,3,yakl::memHost> real3d;
-def real3d(name: string, nx: int, nz: int, nvars: int):
+def real3d(name: str, nx: int, nz: int, nvars: int):
   return np.zeros((nvars, nz, nx), dtype=real)
 
 #typedef yakl::Array<double,2,yakl::memHost> doub2d;
-def doub2d(name: string, nx: int, nz: int):
+def doub2d(name: str, nx: int, nz: int):
   return np.zeros((nz, nx), dtype=real)
 
 #typedef yakl::Array<real   const,1,yakl::memHost> realConst1d;
-def realConst1d(name: string, nx: int):
+def realConst1d(name: str, nx: int):
   return np.zeros(nx, dtype=real)
 
 #typedef yakl::Array<real   const,2,yakl::memHost> realConst2d;
-def realConst2d(name: string, nx: int, nz: int):
+def realConst2d(name: str, nx: int, nz: int):
   return np.zeros((nz, nx), dtype=real)
 
 # ///////////////////////////////////////////////////////////////////////////////////////
@@ -216,7 +214,7 @@ def perform_timestep(
   nx = fixed_data.nx
   nz = fixed_data.nz
 
-  state_tmp = real3d("state_tmp", NUM_VARS, nz+2*hs, nx+2*hs)
+  state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
 
   if direction_switch != 0:
     # x-direction first
@@ -264,7 +262,7 @@ def semi_discrete_step(
   k_beg              = fixed_data.k_beg
   hy_dens_cell       = fixed_data.hy_dens_cell
 
-  tend = real3d("tend", NUM_VARS, nz, nx)
+  tend = real3d("tend", nx=nx, nz=nz, nvars=NUM_VARS)
 
   if dir == DIR_X:
     # Set the halo values for this MPI task's fluid state in the x-direction
@@ -299,7 +297,7 @@ def semi_discrete_step(
           wpert: real = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0)
           tend[ID_WMOM,k,i] += wpert*hy_dens_cell[hs+k]
 
-        state_out(ll,hs+k,hs+i) = state_init(ll,hs+k,hs+i) + dt * tend(ll,k,i);
+        state_out[ll,hs+k,hs+i] = state_init[ll,hs+k,hs+i] + dt * tend[ll,k,i]
 
   # yakl::timer_stop("apply tendencies");
 
@@ -323,7 +321,7 @@ def compute_tendencies_x(
   hy_dens_cell       = fixed_data.hy_dens_cell
   hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
 
-  flux = real3d("flux", NUM_VARS, nz, nx+1)
+  flux = real3d("flux", nx=nx+1, nz=nz, nvars=NUM_VARS)
 
   # Compute the hyperviscosity coefficient
   hv_coef: real = -hv_beta * dx / (16*dt)
@@ -396,7 +394,7 @@ def compute_tendencies_z(
   hy_dens_theta_int  = fixed_data.hy_dens_theta_int
   hy_pressure_int    = fixed_data.hy_pressure_int
 
-  flux = real3d("flux", NUM_VARS, nz+1, nx)
+  flux = real3d("flux", nx=nx, nz=nz+1, nvars=NUM_VARS)
 
   # Compute the hyperviscosity coefficient
   hv_coef: real = -hv_beta * dz / (16*dt);
@@ -411,9 +409,9 @@ def compute_tendencies_z(
       #SArray<real,1,NUM_VARS> d3_vals;
       #SArray<real,1,NUM_VARS> vals;
 
-      stencil = np.zeros((1, 4), dtype=real)
-      d3_vals = np.zeros((1, NUM_VARS), dtype=real)
-      vals = np.zeros((1, NUM_VARS), dtype=real)
+      stencil = np.zeros(4, dtype=real)
+      d3_vals = np.zeros(NUM_VARS, dtype=real)
+      vals = np.zeros(NUM_VARS, dtype=real)
 
       # Use fourth-order interpolation from four cell averages to compute the value at the interface in question
       for ll in range(NUM_VARS):
@@ -440,8 +438,6 @@ def compute_tendencies_z(
       flux[ID_UMOM,k,i] = r*w*u   - hv_coef*d3_vals[ID_UMOM];
       flux[ID_WMOM,k,i] = r*w*w+p - hv_coef*d3_vals[ID_WMOM];
       flux[ID_RHOT,k,i] = r*w*t   - hv_coef*d3_vals[ID_RHOT];
-    }
-  }
 
   # Use the fluxes to compute tendencies for each cell
   #/////////////////////////////////////////////////
@@ -571,7 +567,9 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
   mainproc = (myrank == 0)
 
   # Allocate the model data
-  state = real3d("state", NUM_VARS, nz+2*hs, nx+2*hs)
+  state = real3d("state", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
+  if mainproc:
+    print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}")
 
   # Define the maximum stable time step based on an assumed maximum wind speed
   dt: real = min(dx,dz) / max_speed * cfl;
@@ -608,6 +606,8 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
     for i in range(nx+2*hs):
       # Initialize the state to zero
       for ll in range(NUM_VARS):
+        if mainproc:
+          print(f"ll={ll}, k={k}, i={i}")
         state[ll,k,i] = 0.0
       
       # Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
@@ -688,11 +688,11 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
     hy_dens_theta_int[k] = hr*ht
     hy_pressure_int[k]   = C0*pow((hr*ht), gamm)
 
-  hy_dens_cell       = realConst1d(hy_dens_cell      )
-  hy_dens_theta_cell = realConst1d(hy_dens_theta_cell)
-  hy_dens_int        = realConst1d(hy_dens_int       )
-  hy_dens_theta_int  = realConst1d(hy_dens_theta_int )
-  hy_pressure_int    = realConst1d(hy_pressure_int   )
+  hy_dens_cell       = realConst1d("hy_dens_cell      ", nz+2*hs)
+  hy_dens_theta_cell = realConst1d("hy_dens_theta_cell", nz+2*hs)
+  hy_dens_int        = realConst1d("hy_dens_int       ", nz+1)
+  hy_dens_theta_int  = realConst1d("hy_dens_theta_int ", nz+1)
+  hy_pressure_int    = realConst1d("hy_pressure_int   ", nz+1)
 
   fixed_data = Fixed_data(nx, nz, i_beg, k_beg,
     nranks, myrank, left_rank, right_rank, mainproc,
@@ -794,7 +794,7 @@ def hydro_const_theta(z: real): # returns (r, t)
   r = rt / t                                  # Density at z
 
   return (r, t)
-}
+
 
 
 # Establish hydrostatic balance using constant Brunt-Vaisala frequency
@@ -818,10 +818,10 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t)
 # amp,x0,z0,xrad,zrad are input amplitude, center, and radius of the ellipse
 def sample_ellipse_cosine(x: real, z: real, amp: real, x0: real, z0: real, xrad: real, zrad: real) -> real:
   # Compute distance from bubble center
-  dist: real = sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0
+  dist: real = math.sqrt( ((x-x0)/xrad)*((x-x0)/xrad) + ((z-z0)/zrad)*((z-z0)/zrad) ) * pi / 2.0
   # If the distance from bubble center is less than the radius, create a cos**2 profile
   if dist <= pi / 2.0:
-    return amp * pow(cos(dist),2.)
+    return amp * math.pow(math.cos(dist), 2.0)
   else:
     return 0.0
 
@@ -831,9 +831,9 @@ def sample_ellipse_cosine(x: real, z: real, amp: real, x0: real, z0: real, xrad:
 # If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
 def output(
     state, # realConst3d
-    etime, # real
-    num_out, # int
-    fixed_data # Fixed_data const&
+    etime: real,
+    num_out: int,
+    fixed_data: Fixed_data
   ) -> int: # num_out (updated)
 
   if mainproc:
@@ -841,10 +841,7 @@ def output(
 
   # TODO (mfh 2025/03/04) Actually write to the output file.
 
-  # Increment the number of outputs
-  num_out = num_out + 1;
-
-  return num_out
+  return num_out + 1
 
 
 def finalize() -> None:
@@ -854,7 +851,7 @@ def finalize() -> None:
 # Compute reduced quantities for error checking without resorting to the "ncdiff" tool
 #void reductions( realConst3d state , double &mass , double &te , Fixed_data const &fixed_data ) {
 def reductions(
-    state # realConst3d, an input parameter
+    state, # realConst3d, an input parameter
     fixed_data # Fixed_data const&, an input parameter
   ) -> tuple[double, double]: # mass, te
 
@@ -886,3 +883,7 @@ def reductions(
   #te   = glob[1];
 
   return (mass, te)
+
+
+if __name__ == "__main__":
+  main()

From 3d9794e3c6b0bed627e68f17e0b00c692cfb55e6 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 16:47:36 -0700
Subject: [PATCH 08/37] More run-time debugging

---
 python/miniWeather.py | 68 +++++++++++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 14710c9..8c1ebde 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -76,7 +76,7 @@
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
 sim_time: real = 1000.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
-data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data
+data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // END USER-CONFIGURABLE PARAMETERS
 # ///////////////////////////////////////////////////////////////////////////////////////
@@ -114,21 +114,37 @@ def realConst2d(name: str, nx: int, nz: int):
 # ///////////////////////////////////////////////////////////////////////////////////////
 
 class Fixed_data:
-  def __init__(self):
-    self.nx = 0          # Number of local grid cells in the x dimension for this MPI task
-    self.nz = 0          # Number of local grid cells in the z dimension for this MPI task
-    self.i_beg = 0       # beginning index in the x direction for this MPI task
-    self.k_beg = 0       # beginning index in the z direction for this MPI task
-    self.nranks = 0      # Number of MPI ranks
-    self.myrank = 0      # My rank id
-    self.left_rank = 0   # MPI Rank ID that exists to my left in the global domain
-    self.right_rank = 0  # MPI Rank ID that exists to my right in the global domain
-    self.mainproc = True # Am I the main process (rank == 0)?
-    self.hy_dens_cell = None       # realConst1d: hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
-    self.hy_dens_theta_cell = None # realConst1d: hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
-    self.hy_dens_int = None        # realConst1d: hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
-    self.hy_dens_theta_int = None  # realConst1d: hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
-    self.hy_pressure_int = None    # realConst1d: hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+  def __init__(self,
+    nx: int = 0, # Number of local grid cells in the x dimension for this MPI task
+    nz: int = 0, # Number of local grid cells in the z dimension for this MPI task
+    i_beg: int = 0, # beginning index in the x direction for this MPI task
+    k_beg: int = 0, # beginning index in the z direction for this MPI task
+    nranks: int = 0, # Number of MPI ranks
+    myrank: int = 0, # My rank id
+    left_rank: int = 0, # MPI Rank ID that exists to my left in the global domain
+    right_rank: int = 0, # MPI Rank ID that exists to my right in the global domain
+    mainproc: bool = True, # Am I the main process (rank == 0)?
+    hy_dens_cell = None, # realConst1d: hydrostatic density (vert cell avgs).   Dimensions: (1-hs:nz+hs)
+    hy_dens_theta_cell = None, # realConst1d: hydrostatic rho*t (vert cell avgs).     Dimensions: (1-hs:nz+hs)
+    hy_dens_int = None, # realConst1d: hydrostatic density (vert cell interf). Dimensions: (1:nz+1)
+    hy_dens_theta_int = None, # realConst1d: hydrostatic rho*t (vert cell interf).   Dimensions: (1:nz+1)
+    hy_pressure_int = None, # realConst1d:hydrostatic press (vert cell interf).   Dimensions: (1:nz+1)
+  ):
+
+    self.nx = nx          
+    self.nz = nz         
+    self.i_beg = i_beg
+    self.k_beg = k_beg
+    self.nranks = nranks
+    self.myrank = myrank
+    self.left_rank = left_rank
+    self.right_rank = right_rank
+    self.mainproc = mainproc
+    self.hy_dens_cell = hy_dens_cell
+    self.hy_dens_theta_cell = hy_dens_theta_cell
+    self.hy_dens_int = hy_dens_int
+    self.hy_dens_theta_int = hy_dens_theta_int
+    self.hy_pressure_int = hy_pressure_int
 
 
 # ///////////////////////////////////////////////////////////////////////////////////////
@@ -141,7 +157,7 @@ def main() -> None:
   # fixed_data: Fixed_data
   # state: real3d
   # dt: real: Model time step (seconds)
-  (fixed_data, state, dt) = init()
+  (state, fixed_data, dt) = init()
 
   mainproc = fixed_data.mainproc
 
@@ -158,7 +174,7 @@ def main() -> None:
   # ////////////////////////////////////////////////////
   # MAIN TIME STEP LOOP
   # ////////////////////////////////////////////////////
-  def run_simulation():
+  def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
     direction_switch: int = 1  # Order in which dimensional splitting takes x,z solves
     output_counter: real = 0.0 # Helps determine when it's time to do output
 
@@ -179,8 +195,10 @@ def run_simulation():
       if output_freq >= 0 and output_counter >= output_freq:
         output_counter = output_counter - output_freq
       num_out = output(state, etime, num_out, fixed_data)
+      # NOTE (mfh 2025/03/04) etime and num_out will be discarded.
+      # Figure out how to return them from within a timeit expression.
 
-  time_in_s = timeit.timeit(run_simulation(), number=1)
+  time_in_s = timeit.timeit(main_time_step_loop(dt, etime, num_out), number=1)
   if mainproc:
     print(f"CPU Time: {time_in_s} s\n")
 
@@ -337,9 +355,9 @@ def compute_tendencies_x(
       #SArray<real,1,NUM_VARS> d3_vals;
       #SArray<real,1,NUM_VARS> vals;
 
-      stencil = np.zeros((1, 4), dtype=real)
-      d3_vals = np.zeros((1, NUM_VARS), dtype=real)
-      vals = np.zeros((1, NUM_VARS), dtype=real)
+      stencil = np.zeros(4, dtype=real)
+      d3_vals = np.zeros(NUM_VARS, dtype=real)
+      vals = np.zeros(NUM_VARS, dtype=real)
       
       # Use fourth-order interpolation from four cell averages to compute the value at the interface in question
       for ll in range(NUM_VARS):
@@ -606,8 +624,8 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
     for i in range(nx+2*hs):
       # Initialize the state to zero
       for ll in range(NUM_VARS):
-        if mainproc:
-          print(f"ll={ll}, k={k}, i={i}")
+        #if mainproc:
+        #  print(f"ll={ll}, k={k}, i={i}")
         state[ll,k,i] = 0.0
       
       # Use Gauss-Legendre quadrature to initialize a hydrostatic balance + temperature perturbation
@@ -836,7 +854,7 @@ def output(
     fixed_data: Fixed_data
   ) -> int: # num_out (updated)
 
-  if mainproc:
+  if fixed_data.mainproc:
     print("*** OUTPUT ***\n")
 
   # TODO (mfh 2025/03/04) Actually write to the output file.

From 6762e76a1f4ed36354993c5e3725666fde314ea9 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 16:50:30 -0700
Subject: [PATCH 09/37] More run-time debugging

---
 python/miniWeather.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 8c1ebde..fe3d203 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -706,12 +706,6 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
     hy_dens_theta_int[k] = hr*ht
     hy_pressure_int[k]   = C0*pow((hr*ht), gamm)
 
-  hy_dens_cell       = realConst1d("hy_dens_cell      ", nz+2*hs)
-  hy_dens_theta_cell = realConst1d("hy_dens_theta_cell", nz+2*hs)
-  hy_dens_int        = realConst1d("hy_dens_int       ", nz+1)
-  hy_dens_theta_int  = realConst1d("hy_dens_theta_int ", nz+1)
-  hy_pressure_int    = realConst1d("hy_pressure_int   ", nz+1)
-
   fixed_data = Fixed_data(nx, nz, i_beg, k_beg,
     nranks, myrank, left_rank, right_rank, mainproc,
     hy_dens_cell, hy_dens_theta_cell, hy_dens_int,

From 880ee6adc5f8f3e19e3bc8d1964caf91caf14a3c Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Tue, 4 Mar 2025 17:07:53 -0700
Subject: [PATCH 10/37] It runs to completion without errors

---
 python/miniWeather.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index fe3d203..b4f3222 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -74,7 +74,7 @@
 # So, you'll want to have nx_glob be twice as large as nz_glob
 nz_glob: int = 50               # Number of total cells in the z-direction
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
-sim_time: real = 1000.0         # How many seconds to run the simulation
+sim_time: real = 20.0 #1000.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
 data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
@@ -174,6 +174,7 @@ def main() -> None:
   # ////////////////////////////////////////////////////
   # MAIN TIME STEP LOOP
   # ////////////////////////////////////////////////////
+
   def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
     direction_switch: int = 1  # Order in which dimensional splitting takes x,z solves
     output_counter: real = 0.0 # Helps determine when it's time to do output
@@ -195,10 +196,14 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
       if output_freq >= 0 and output_counter >= output_freq:
         output_counter = output_counter - output_freq
       num_out = output(state, etime, num_out, fixed_data)
-      # NOTE (mfh 2025/03/04) etime and num_out will be discarded.
-      # Figure out how to return them from within a timeit expression.
 
-  time_in_s = timeit.timeit(main_time_step_loop(dt, etime, num_out), number=1)
+    return (etime, num_out)
+
+  start_time = timeit.default_timer()
+  (etime, num_out) = main_time_step_loop(dt, etime, num_out)
+  end_time = timeit.default_timer()
+
+  time_in_s = end_time - start_time
   if mainproc:
     print(f"CPU Time: {time_in_s} s\n")
 

From 01e45bcba091148e1c4d1d47c012b616f96472ba Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 00:36:37 +0200
Subject: [PATCH 11/37] Debugging Python (C is OK)

---
 c/miniWeather_serial.cpp | 16 +++++++++++++++-
 cpp/CMakeLists.txt       |  2 +-
 cpp/YAKL                 |  2 +-
 python/miniWeather.py    | 17 +++++++++++------
 4 files changed, 28 insertions(+), 9 deletions(-)
 mode change 160000 => 120000 cpp/YAKL

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index 0d04d44..12f14ce 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -132,6 +132,10 @@ int main(int argc, char **argv) {
 
   //Initial reductions for mass, kinetic energy, and total energy
   reductions(mass0,te0);
+  {
+    printf( "mass0: %le\n" , mass0 );
+    printf( "te0:   %le\n" , te0   );
+  }
 
   //Output the initial state
   output(state,etime);
@@ -157,6 +161,13 @@ int main(int argc, char **argv) {
       output_counter = output_counter - output_freq;
       output(state,etime);
     }
+    {
+      double mass = 0.0;
+      double te = 0.0;
+      reductions(mass, te);
+      printf( "mass: %le\n" , mass );
+      printf( "te:   %le\n" , te   );
+    }
   }
   auto t2 = std::chrono::steady_clock::now();
   if (mainproc) {
@@ -722,6 +733,7 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou
 //The file I/O uses parallel-netcdf, the only external library required for this mini-app.
 //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
 void output( double *state , double etime ) {
+#if 0
   int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
   int i, k, ind_r, ind_u, ind_w, ind_t;
   MPI_Offset st1[1], ct1[1], st3[3], ct3[3];
@@ -802,16 +814,18 @@ void output( double *state , double etime ) {
 
   //Close the file
   ncwrap( ncmpi_close(ncid) , __LINE__ );
-
+#endif // 0
   //Increment the number of outputs
   num_out = num_out + 1;
 
+#if 0
   //Deallocate the temp arrays
   free( dens     );
   free( uwnd     );
   free( wwnd     );
   free( theta    );
   free( etimearr );
+#endif // 0
 }
 
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9dde59f..8b9ea27 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -116,7 +116,7 @@ target_link_libraries(parallelfor_simd_x_test "${LDFLAGS}")
 add_test(NAME YAKL_SIMD_X_TEST COMMAND ./check_output.sh ./parallelfor_simd_x_test 1e-9 4.5e-5 )
 
 
-include(YAKL/yakl_utils.cmake)
+include(YAKL/deprecated/yakl_utils.cmake)
 yakl_process_target(serial)
 yakl_process_target(serial_test)
 yakl_process_target(mpi)
diff --git a/cpp/YAKL b/cpp/YAKL
deleted file mode 160000
index 71a059c..0000000
--- a/cpp/YAKL
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 71a059c4701d22f3d60157b01a922776261993c0
diff --git a/cpp/YAKL b/cpp/YAKL
new file mode 120000
index 0000000..faed9af
--- /dev/null
+++ b/cpp/YAKL
@@ -0,0 +1 @@
+../../YAKL
\ No newline at end of file
diff --git a/python/miniWeather.py b/python/miniWeather.py
index b4f3222..bd0f438 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -163,6 +163,9 @@ def main() -> None:
 
   # Initial reductions for mass, kinetic energy, and total energy
   (mass0, te0) = reductions(state, fixed_data)
+  if mainproc:
+    print(f"mass0: {mass0:.6e}")
+    print(f"te0:   {te0:.6e}")
 
   num_out: int = 0    # The number of outputs performed so far
   etime: real  = 0.0  # Elapsed time
@@ -188,7 +191,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
       direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
       # Inform the user
       if mainproc:
-        print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}\n")
+        print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}")
       # Update the elapsed time and output counter
       etime = etime + dt
       output_counter = output_counter + dt
@@ -197,6 +200,11 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
         output_counter = output_counter - output_freq
       num_out = output(state, etime, num_out, fixed_data)
 
+      (mass, te) = reductions(state, fixed_data)
+      if mainproc:
+        print(f"mass: {mass:.6e}")
+        print(f"te:   {te:.6e}")
+
     return (etime, num_out)
 
   start_time = timeit.default_timer()
@@ -211,8 +219,8 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
   (mass, te) = reductions(state, fixed_data)
 
   if mainproc:
-    print(f"d_mass: {((mass - mass0)/mass0)}")
-    print(f"d_te:   {((te   - te0  )/te0  )}")
+    print(f"d_mass: {((mass - mass0)/mass0):.6e}")
+    print(f"d_te:   {((te   - te0  )/te0  ):.6e}")
 
   finalize()
 
@@ -853,9 +861,6 @@ def output(
     fixed_data: Fixed_data
   ) -> int: # num_out (updated)
 
-  if fixed_data.mainproc:
-    print("*** OUTPUT ***\n")
-
   # TODO (mfh 2025/03/04) Actually write to the output file.
 
   return num_out + 1

From 2d4f752e0d4e0975a04624efb980459ee8c0958d Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 00:43:09 +0200
Subject: [PATCH 12/37] Remove some Python / C diffs

Remove some differences (state_tmp allocation and initialization)
between Python and C.  It didn't change the Python output at all.
---
 python/miniWeather.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index bd0f438..e01a2c7 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -156,8 +156,9 @@ def main() -> None:
 
   # fixed_data: Fixed_data
   # state: real3d
+  # state_tmp: real3d
   # dt: real: Model time step (seconds)
-  (state, fixed_data, dt) = init()
+  (state, state_tmp, fixed_data, dt) = init()
 
   mainproc = fixed_data.mainproc
 
@@ -188,7 +189,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
         dt = sim_time - etime
 
       # Perform a single time step
-      direction_switch = perform_timestep(state, dt, direction_switch, fixed_data)
+      direction_switch = perform_timestep(state, state_tmp, dt, direction_switch, fixed_data)
       # Inform the user
       if mainproc:
         print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}")
@@ -237,6 +238,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
 #   q_n+1  = q_n + dt/1 * rhs(q**)
 def perform_timestep(
     state, # real3d const&, input parameter
+    state_tmp,
     dt, # real, must be an input parameter
     direction_switch, # int&, in/out parameter, transformed to input and return value
     fixed_data # Fixed_data const &, input parameter
@@ -245,8 +247,6 @@ def perform_timestep(
   nx = fixed_data.nx
   nz = fixed_data.nz
 
-  state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
-
   if direction_switch != 0:
     # x-direction first
     semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, fixed_data)
@@ -569,7 +569,7 @@ def set_halo_values_z(
 
 # state, dt, and fixed_data used to be output parameters.
 # It would be more Pythonic to return them as a tuple.
-def init(): # -> tuple[real3d, real, Fixed_data]:
+def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_data)
 
   ierr: int = 0
 
@@ -601,6 +601,7 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
   state = real3d("state", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
   if mainproc:
     print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}")
+  state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
 
   # Define the maximum stable time step based on an assumed maximum wind speed
   dt: real = min(dx,dz) / max_speed * cfl;
@@ -666,6 +667,9 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
           state[ID_WMOM,k,i] += (r+hr)*w                  * qweights[ii]*qweights[kk]
           state[ID_RHOT,k,i] += ( (r+hr)*(t+ht) - hr*ht ) * qweights[ii]*qweights[kk]
 
+      for ll in range(NUM_VARS):
+        state_tmp[ll,k,i] = state[ll,k,i]
+          
   hy_dens_cell       = real1d("hy_dens_cell      ", nz+2*hs)
   hy_dens_theta_cell = real1d("hy_dens_theta_cell", nz+2*hs)
   hy_dens_int        = real1d("hy_dens_int       ", nz+1)
@@ -724,7 +728,7 @@ def init(): # -> tuple[real3d, real, Fixed_data]:
     hy_dens_cell, hy_dens_theta_cell, hy_dens_int,
     hy_dens_theta_int, hy_pressure_int)
 
-  return (state, fixed_data, dt)
+  return (state, state_tmp, fixed_data, dt)
 
 # This test case is initially balanced but injects fast, cold air from the left boundary near the model top
 # x and z are input coordinates at which to sample

From f9b94f6f22c24c4b5c2a597a8508138aaf251d04 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:01:47 +0200
Subject: [PATCH 13/37] Make Python flux like C flux

Extents of flux in Python differed from those in C.
I changed the Python flux allocation to work like C,
but that didn't help the mass balance.
---
 python/miniWeather.py | 55 +++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index e01a2c7..ef046d4 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -155,10 +155,11 @@ def main() -> None:
   #yakl::init();
 
   # fixed_data: Fixed_data
-  # state: real3d
-  # state_tmp: real3d
+  # state: real3d, NUM_VARS x (nz+2*hs) x (nz+2*hs)
+  # state_tmp: real3d, ditto
+  # flux: real3d, NUM_VARS x (nz+1) x (nx+1)
   # dt: real: Model time step (seconds)
-  (state, state_tmp, fixed_data, dt) = init()
+  (state, state_tmp, flux, fixed_data, dt) = init()
 
   mainproc = fixed_data.mainproc
 
@@ -189,7 +190,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
         dt = sim_time - etime
 
       # Perform a single time step
-      direction_switch = perform_timestep(state, state_tmp, dt, direction_switch, fixed_data)
+      direction_switch = perform_timestep(state, state_tmp, flux, dt, direction_switch, fixed_data)
       # Inform the user
       if mainproc:
         print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}")
@@ -238,7 +239,8 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
 #   q_n+1  = q_n + dt/1 * rhs(q**)
 def perform_timestep(
     state, # real3d const&, input parameter
-    state_tmp,
+    state_tmp, # real3d
+    flux, # real3d
     dt, # real, must be an input parameter
     direction_switch, # int&, in/out parameter, transformed to input and return value
     fixed_data # Fixed_data const &, input parameter
@@ -249,22 +251,22 @@ def perform_timestep(
 
   if direction_switch != 0:
     # x-direction first
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, flux, fixed_data)
     # z-direction second
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, flux, fixed_data)
   else:
     # z-direction second
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, flux, fixed_data)
     # x-direction first
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, flux, fixed_data)
 
   if direction_switch:
     direction_switch = 0
@@ -284,6 +286,7 @@ def semi_discrete_step(
     state_out, # real3d const&,
     dt: real,
     dir: int,
+    flux, # real3d
     fixed_data # Fixed_data const&
   ) -> None:
 
@@ -302,7 +305,7 @@ def semi_discrete_step(
     #yakl::timer_stop("halo x");
     # Compute the time tendencies for the fluid state in the x-direction
     #yakl::timer_start("tendencies x");
-    compute_tendencies_x(state_forcing, tend, dt, fixed_data)
+    compute_tendencies_x(state_forcing, flux, tend, dt, fixed_data)
     #yakl::timer_stop("tendencies x");
   elif dir == DIR_Z:
     # Set the halo values for this MPI task's fluid state in the z-direction
@@ -311,7 +314,7 @@ def semi_discrete_step(
     #yakl::timer_stop("halo z");
     # Compute the time tendencies for the fluid state in the z-direction
     #yakl::timer_start("tendencies z");
-    compute_tendencies_z(state_forcing, tend, dt, fixed_data)
+    compute_tendencies_z(state_forcing, flux, tend, dt, fixed_data)
     #yakl::timer_stop("tendencies z");
 
   # /////////////////////////////////////////////////
@@ -323,6 +326,9 @@ def semi_discrete_step(
     for k in range(nz):
       for i in range(nx):
         if data_spec_int == DATA_SPEC_GRAVITY_WAVES:
+          print("*** NOT IMPLEMENTED ***");
+          sys.exit(-1);
+          
           x: real = (i_beg + i+0.5)*dx;
           z: real = (k_beg + k+0.5)*dz;
           wpert: real = sample_ellipse_cosine(x, z, 0.01, xlen/8, 1000.0, 500.0, 500.0)
@@ -342,6 +348,7 @@ def semi_discrete_step(
 # Then, compute the tendencies using those fluxes
 def compute_tendencies_x(
     state, # realConst3d
+    flux,
     tend, # real3d const&
     dt, # real
     fixed_data # Fixed_data const&
@@ -352,8 +359,6 @@ def compute_tendencies_x(
   hy_dens_cell       = fixed_data.hy_dens_cell
   hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
 
-  flux = real3d("flux", nx=nx+1, nz=nz, nvars=NUM_VARS)
-
   # Compute the hyperviscosity coefficient
   hv_coef: real = -hv_beta * dx / (16*dt)
   # /////////////////////////////////////////////////
@@ -414,6 +419,7 @@ def compute_tendencies_x(
 # Then, compute the tendencies using those fluxes
 def compute_tendencies_z(
     state, # realConst3d
+    flux,
     tend, # real3d const&
     dt, # real
     fixed_data # Fixed_data const&
@@ -425,8 +431,6 @@ def compute_tendencies_z(
   hy_dens_theta_int  = fixed_data.hy_dens_theta_int
   hy_pressure_int    = fixed_data.hy_pressure_int
 
-  flux = real3d("flux", nx=nx, nz=nz+1, nvars=NUM_VARS)
-
   # Compute the hyperviscosity coefficient
   hv_coef: real = -hv_beta * dz / (16*dt);
   # /////////////////////////////////////////////////
@@ -569,7 +573,7 @@ def set_halo_values_z(
 
 # state, dt, and fixed_data used to be output parameters.
 # It would be more Pythonic to return them as a tuple.
-def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_data)
+def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixed_data: Fixed_data)
 
   ierr: int = 0
 
@@ -602,6 +606,7 @@ def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_
   if mainproc:
     print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}")
   state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
+  flux = real3d("flux", nx=nx+1, nz=nz+1, nvars=NUM_VARS)
 
   # Define the maximum stable time step based on an assumed maximum wind speed
   dt: real = min(dx,dz) / max_speed * cfl;
@@ -728,7 +733,7 @@ def init(): # -> (state: real3d, state_tmp: real3d, dt: real, fixed_data: Fixed_
     hy_dens_cell, hy_dens_theta_cell, hy_dens_int,
     hy_dens_theta_int, hy_pressure_int)
 
-  return (state, state_tmp, fixed_data, dt)
+  return (state, state_tmp, flux, fixed_data, dt)
 
 # This test case is initially balanced but injects fast, cold air from the left boundary near the model top
 # x and z are input coordinates at which to sample

From 52e035b5f0cf078be9a9d97814c502219dd5d580 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:09:00 +0200
Subject: [PATCH 14/37] Make direction_switch the same; verify

---
 c/miniWeather_serial.cpp | 1 +
 python/miniWeather.py    | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index 12f14ce..512bf55 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -194,6 +194,7 @@ int main(int argc, char **argv) {
 // q**    = q[n] + dt/2 * rhs(q*  )
 // q[n+1] = q[n] + dt/1 * rhs(q** )
 void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {
+  printf("direction_switch: %d\n", direction_switch);
   if (direction_switch) {
     //x-direction first
     semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );
diff --git a/python/miniWeather.py b/python/miniWeather.py
index ef046d4..a1013f2 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -249,7 +249,8 @@ def perform_timestep(
   nx = fixed_data.nx
   nz = fixed_data.nz
 
-  if direction_switch != 0:
+  print(f'direction_switch: {direction_switch}')
+  if direction_switch:
     # x-direction first
     semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, fixed_data)
     semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data)

From 650634205c359459f7e5d851cc9cd00e56ef37a0 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:16:04 +0200
Subject: [PATCH 15/37] Reconcile more, but no change in results

---
 python/miniWeather.py | 55 ++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index a1013f2..81a947f 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -159,7 +159,7 @@ def main() -> None:
   # state_tmp: real3d, ditto
   # flux: real3d, NUM_VARS x (nz+1) x (nx+1)
   # dt: real: Model time step (seconds)
-  (state, state_tmp, flux, fixed_data, dt) = init()
+  (state, state_tmp, flux, tend, fixed_data, dt) = init()
 
   mainproc = fixed_data.mainproc
 
@@ -190,7 +190,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
         dt = sim_time - etime
 
       # Perform a single time step
-      direction_switch = perform_timestep(state, state_tmp, flux, dt, direction_switch, fixed_data)
+      direction_switch = perform_timestep(state, state_tmp, flux, tend, dt, direction_switch, fixed_data)
       # Inform the user
       if mainproc:
         print(f"Elapsed Time: {etime}, Simulation Time: {sim_time}")
@@ -241,6 +241,7 @@ def perform_timestep(
     state, # real3d const&, input parameter
     state_tmp, # real3d
     flux, # real3d
+    tend, # real3d
     dt, # real, must be an input parameter
     direction_switch, # int&, in/out parameter, transformed to input and return value
     fixed_data # Fixed_data const &, input parameter
@@ -252,22 +253,22 @@ def perform_timestep(
   print(f'direction_switch: {direction_switch}')
   if direction_switch:
     # x-direction first
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, flux, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, flux, tend, fixed_data)
     # z-direction second
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, flux, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, flux, tend, fixed_data)
   else:
     # z-direction second
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, flux, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_Z, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_Z, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_Z, flux, tend, fixed_data)
     # x-direction first
-    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, fixed_data)
-    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, flux, fixed_data)
+    semi_discrete_step(state, state    , state_tmp, dt / 3, DIR_X, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state_tmp, dt / 2, DIR_X, flux, tend, fixed_data)
+    semi_discrete_step(state, state_tmp, state    , dt / 1, DIR_X, flux, tend, fixed_data)
 
   if direction_switch:
     direction_switch = 0
@@ -288,6 +289,7 @@ def semi_discrete_step(
     dt: real,
     dir: int,
     flux, # real3d
+    tend, # real3d
     fixed_data # Fixed_data const&
   ) -> None:
 
@@ -297,8 +299,6 @@ def semi_discrete_step(
   k_beg              = fixed_data.k_beg
   hy_dens_cell       = fixed_data.hy_dens_cell
 
-  tend = real3d("tend", nx=nx, nz=nz, nvars=NUM_VARS)
-
   if dir == DIR_X:
     # Set the halo values for this MPI task's fluid state in the x-direction
     #yakl::timer_start("halo x");
@@ -393,7 +393,7 @@ def compute_tendencies_x(
       u = vals[ID_UMOM] / r
       w = vals[ID_WMOM] / r
       t = (vals[ID_RHOT] + hy_dens_theta_cell[hs+k]) / r
-      p = C0*pow((r*t), gamm)
+      p = C0*math.pow((r*t), gamm)
 
       # Compute the flux vector
       flux[ID_DENS,k,i] = r*u     - hv_coef*d3_vals[ID_DENS]
@@ -464,7 +464,7 @@ def compute_tendencies_z(
       u: real = vals[ID_UMOM] / r;
       w: real = vals[ID_WMOM] / r;
       t: real = ( vals[ID_RHOT] + hy_dens_theta_int[k] ) / r;
-      p: real = C0*pow((r*t),gamm) - hy_pressure_int[k];
+      p: real = C0*math.pow((r*t),gamm) - hy_pressure_int[k];
       if k == 0 or k == nz:
         w                = 0;
         d3_vals[ID_DENS] = 0;
@@ -574,7 +574,7 @@ def set_halo_values_z(
 
 # state, dt, and fixed_data used to be output parameters.
 # It would be more Pythonic to return them as a tuple.
-def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixed_data: Fixed_data)
+def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, tend: real3d, dt: real, fixed_data: Fixed_data)
 
   ierr: int = 0
 
@@ -608,6 +608,7 @@ def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixe
     print(f"Allocate state: NUM_VARS={NUM_VARS}, nx+2*hs={nx+2*hs}, nz+2*hs={nz+2*hs}")
   state_tmp = real3d("state_tmp", nx=nx+2*hs, nz=nz+2*hs, nvars=NUM_VARS)
   flux = real3d("flux", nx=nx+1, nz=nz+1, nvars=NUM_VARS)
+  tend = real3d("tend", nx=nx, nz=nz, nvars=NUM_VARS)
 
   # Define the maximum stable time step based on an assumed maximum wind speed
   dt: real = min(dx,dz) / max_speed * cfl;
@@ -727,14 +728,14 @@ def init(): # -> (state: real3d, state_tmp: real3d, flux: real3d, dt: real, fixe
     
     hy_dens_int[k]       = hr
     hy_dens_theta_int[k] = hr*ht
-    hy_pressure_int[k]   = C0*pow((hr*ht), gamm)
+    hy_pressure_int[k]   = C0*math.pow((hr*ht), gamm)
 
   fixed_data = Fixed_data(nx, nz, i_beg, k_beg,
     nranks, myrank, left_rank, right_rank, mainproc,
     hy_dens_cell, hy_dens_theta_cell, hy_dens_int,
     hy_dens_theta_int, hy_pressure_int)
 
-  return (state, state_tmp, flux, fixed_data, dt)
+  return (state, state_tmp, flux, tend, fixed_data, dt)
 
 # This test case is initially balanced but injects fast, cold air from the left boundary near the model top
 # x and z are input coordinates at which to sample
@@ -824,8 +825,8 @@ def hydro_const_theta(z: real): # returns (r, t)
   # Establish hydrostatic balance first using Exner pressure
   t = theta0                                  # Potential Temperature at z
   exner = exner0 - grav * z / (cp * theta0)   # Exner pressure at z
-  p = p0 * pow(exner, (cp/rd))                # Pressure at z
-  rt = pow((p / C0), (1. / gamm))             # rho*theta at z
+  p = p0 * math.pow(exner, (cp/rd))                # Pressure at z
+  rt = math.pow((p / C0), (1. / gamm))             # rho*theta at z
   r = rt / t                                  # Density at z
 
   return (r, t)
@@ -841,8 +842,8 @@ def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t)
   exner0: real =   1.0  # Surface-level Exner pressure
   t = theta0 * exp( bv_freq0*bv_freq0 / grav * z )                                     # Pot temp at z
   exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0)  # Exner pressure at z
-  p = p0 * pow(exner,(cp/rd))                                                          # Pressure at z
-  rt = pow((p / C0),(1. / gamm))                                                       # rho*theta at z
+  p = p0 * math.pow(exner,(cp/rd))                                                          # Pressure at z
+  rt = math.pow((p / C0),(1. / gamm))                                                       # rho*theta at z
   r = rt / t                                                                           # Density at z
 
   return (r, t)
@@ -900,8 +901,8 @@ def reductions(
       u = state[ID_UMOM,hs+k,hs+i] / r                                 # U-wind
       w = state[ID_WMOM,hs+k,hs+i] / r                                 # W-wind
       th = ( state[ID_RHOT,hs+k,hs+i] + hy_dens_theta_cell[hs+k] ) / r # Potential Temperature (theta)
-      p  = C0*pow(r*th,gamm)                               # Pressure
-      t  = th / pow(p0/p,rd/cp)                            # Temperature
+      p  = C0*math.pow(r*th,gamm)                               # Pressure
+      t  = th / math.pow(p0/p,rd/cp)                            # Temperature
       ke = r*(u*u+w*w)                                     # Kinetic Energy
       ie = r*cv*t                                          # Internal Energy
       mass += r        *dx*dz # Accumulate domain mass

From 53dbd0eaefab1b28d66c15e089433437c97ee9ea Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:18:52 +0200
Subject: [PATCH 16/37] Reconcile more, but no change in results

---
 python/miniWeather.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 81a947f..98ed4bb 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -840,7 +840,7 @@ def hydro_const_theta(z: real): # returns (r, t)
 def hydro_const_bvfreq(z: real, bv_freq0: real): # returns (r, t)
   theta0: real = 300.0  # Background potential temperature
   exner0: real =   1.0  # Surface-level Exner pressure
-  t = theta0 * exp( bv_freq0*bv_freq0 / grav * z )                                     # Pot temp at z
+  t = theta0 * math.exp( bv_freq0*bv_freq0 / grav * z )                                     # Pot temp at z
   exner = exner0 - grav*grav / (cp * bv_freq0*bv_freq0) * (t - theta0) / (t * theta0)  # Exner pressure at z
   p = p0 * math.pow(exner,(cp/rd))                                                          # Pressure at z
   rt = math.pow((p / C0),(1. / gamm))                                                       # rho*theta at z

From 4c6fd2056ca1fd3c26fa6f578967d43d4ae175af Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:24:19 +0200
Subject: [PATCH 17/37] Reconcile collision; didn't help

---
 python/miniWeather.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 98ed4bb..f3210ad 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -808,10 +808,8 @@ def collision(x: real, z: real): # returns (r, u, w, t, hr, ht)
   t = 0.0
   u = 0.0
   w = 0.0
-  # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter.
-  # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++).
-  t = sample_ellipse_cosine(x, z,  20.0, xlen/2, 2000.0, 2000.0, 2000.0) + \
-      sample_ellipse_cosine(x, z, -20.0, xlen/2, 8000.0, 2000.0, 2000.0)
+  t = t + sample_ellipse_cosine(x, z,  20.0, xlen/2, 2000.0, 2000.0, 2000.0)
+  t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2, 8000.0, 2000.0, 2000.0)
 
   return (r, u, w, t, hr, ht)
 

From 8dd07fb77235ce349807ceae81b4f6f58d30a86b Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:29:26 +0200
Subject: [PATCH 18/37] More reconciling: abs -> math.fabs in set_halo_values_x

---
 python/miniWeather.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index f3210ad..e901365 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -530,7 +530,7 @@ def set_halo_values_x(
       for k in range(nz):
         for i in range(hs):
           z: real = (k_beg + k+0.5)*dz
-          if abs(z-3*zlen/4) <= zlen/16:
+          if math.fabs(z-3*zlen/4) <= zlen/16:
             state[ID_UMOM,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 50.0
             state[ID_RHOT,hs+k,i] = (state[ID_DENS,hs+k,i] + hy_dens_cell[hs+k]) * 298.0 - hy_dens_theta_cell[hs+k]
 
@@ -761,9 +761,7 @@ def density_current(x: real, z: real): # returns (r, u, w, t, hr, ht)
   t = 0.0
   u = 0.0
   w = 0.0
-  # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter.
-  # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++).
-  t = sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0)
+  t = t + sample_ellipse_cosine(x, z, -20.0, xlen/2, 5000.0, 4000.0, 2000.0)
 
   return (r, u, w, t, hr, ht)
 
@@ -791,9 +789,7 @@ def thermal(x: real, z: real): # returns (r, u, w, t, hr, ht)
   t = 0.0
   u = 0.0
   w = 0.0
-  # FIXME (mfh 2025/03/03) This was originally t = t + ... so perhaps t should also be an input parameter.
-  # On the other hand, t was uninitialized before (which means this was probably incorrect in the original C++).
-  t = sample_ellipse_cosine(x, z, 3.0, xlen/2, 2000.0, 2000.0, 2000.0)
+  t = t + sample_ellipse_cosine(x, z, 3.0, xlen/2, 2000.0, 2000.0, 2000.0)
 
   return (r, u, w, t, hr, ht)
 

From 5ae15b13f7b9077229c882bbec46da6ce0f83f05 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 01:47:11 +0200
Subject: [PATCH 19/37] Remove superfluous comments

---
 python/miniWeather.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index e901365..f3760af 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -301,33 +301,24 @@ def semi_discrete_step(
 
   if dir == DIR_X:
     # Set the halo values for this MPI task's fluid state in the x-direction
-    #yakl::timer_start("halo x");
     set_halo_values_x(state_forcing, fixed_data)
-    #yakl::timer_stop("halo x");
     # Compute the time tendencies for the fluid state in the x-direction
-    #yakl::timer_start("tendencies x");
     compute_tendencies_x(state_forcing, flux, tend, dt, fixed_data)
-    #yakl::timer_stop("tendencies x");
   elif dir == DIR_Z:
     # Set the halo values for this MPI task's fluid state in the z-direction
-    #yakl::timer_start("halo z");
     set_halo_values_z(state_forcing, fixed_data)
-    #yakl::timer_stop("halo z");
     # Compute the time tendencies for the fluid state in the z-direction
-    #yakl::timer_start("tendencies z");
     compute_tendencies_z(state_forcing, flux, tend, dt, fixed_data)
-    #yakl::timer_stop("tendencies z");
 
   # /////////////////////////////////////////////////
   # // TODO: MAKE THESE 3 LOOPS A PARALLEL_FOR
   # /////////////////////////////////////////////////
   # Apply the tendencies to the fluid state
-  # yakl::timer_start("apply tendencies");
   for ll in range(NUM_VARS):
     for k in range(nz):
       for i in range(nx):
         if data_spec_int == DATA_SPEC_GRAVITY_WAVES:
-          print("*** NOT IMPLEMENTED ***");
+          print("*** TEMPORARILY DISABLED ***");
           sys.exit(-1);
           
           x: real = (i_beg + i+0.5)*dx;
@@ -337,8 +328,6 @@ def semi_discrete_step(
 
         state_out[ll,hs+k,hs+i] = state_init[ll,hs+k,hs+i] + dt * tend[ll,k,i]
 
-  # yakl::timer_stop("apply tendencies");
-
   # NOTE It's OK for this not to return anything,
   # as long as we can treat state_out as an output parameter.
 

From bc5c387c53e76981d3c140b5d2bfe8187652a888 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Fri, 7 Mar 2025 19:09:46 +0200
Subject: [PATCH 20/37] Change Python to run THERMAL not INJECTION

---
 python/miniWeather.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index f3760af..61048ca 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -76,7 +76,8 @@
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
 sim_time: real = 20.0 #1000.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
-data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
+#data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
+data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // END USER-CONFIGURABLE PARAMETERS
 # ///////////////////////////////////////////////////////////////////////////////////////

From 81ae53d9b814c758a8717fa5645e6c8f35c15a7a Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 19:12:25 +0200
Subject: [PATCH 21/37] Fix MPI_Info_dup error in C output

PNetCDF wants an MPI_Info that is not MPI_INFO_NULL.

There are other issues with C output.
---
 c/miniWeather_serial.cpp | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index 512bf55..f2638c9 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -734,7 +734,7 @@ double sample_ellipse_cosine( double x , double z , double amp , double x0 , dou
 //The file I/O uses parallel-netcdf, the only external library required for this mini-app.
 //If it's too cumbersome, you can comment the I/O out, but you'll miss out on some potentially cool graphics
 void output( double *state , double etime ) {
-#if 0
+#if 1
   int ncid, t_dimid, x_dimid, z_dimid, dens_varid, uwnd_varid, wwnd_varid, theta_varid, t_varid, dimids[3];
   int i, k, ind_r, ind_u, ind_w, ind_t;
   MPI_Offset st1[1], ct1[1], st3[3], ct3[3];
@@ -750,10 +750,19 @@ void output( double *state , double etime ) {
   theta    = (double *) malloc(nx*nz*sizeof(double));
   etimearr = (double *) malloc(1    *sizeof(double));
 
+  // PNetCDF needs an MPI_Info object that is not MPI_INFO_NULL.
+  // It's possible that earlier PNetCDF versions tolerated MPI_INFO_NULL.
+  MPI_Info mpi_info;
+  auto info_err = MPI_Info_create(&mpi_info);
+  if (info_err != MPI_SUCCESS) {
+    printf("Error creating MPI Info object\n");
+    MPI_Abort(MPI_COMM_WORLD, -1);
+  }
+
   //If the elapsed time is zero, create the file. Otherwise, open the file
   if (etime == 0) {
     //Create the file
-    ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , MPI_INFO_NULL , &ncid ) , __LINE__ );
+    ncwrap( ncmpi_create( MPI_COMM_WORLD , "output.nc" , NC_CLOBBER , mpi_info , &ncid ) , __LINE__ );
     //Create the dimensions
     ncwrap( ncmpi_def_dim( ncid , "t" , (MPI_Offset) NC_UNLIMITED , &t_dimid ) , __LINE__ );
     ncwrap( ncmpi_def_dim( ncid , "x" , (MPI_Offset) nx_glob      , &x_dimid ) , __LINE__ );
@@ -770,7 +779,7 @@ void output( double *state , double etime ) {
     ncwrap( ncmpi_enddef( ncid ) , __LINE__ );
   } else {
     //Open the file
-    ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , MPI_INFO_NULL , &ncid ) , __LINE__ );
+    ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , mpi_info , &ncid ) , __LINE__ );
     //Get the variable IDs
     ncwrap( ncmpi_inq_varid( ncid , "dens"  ,  &dens_varid ) , __LINE__ );
     ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ );
@@ -819,7 +828,9 @@ void output( double *state , double etime ) {
   //Increment the number of outputs
   num_out = num_out + 1;
 
-#if 0
+#if 1
+  MPI_Info_free(&mpi_info);
+  
   //Deallocate the temp arrays
   free( dens     );
   free( uwnd     );

From da28c5581229d949e38bdfb2f7011cc6510f7c2c Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 20:52:54 +0200
Subject: [PATCH 22/37] C: output updates

---
 c/miniWeather_serial.cpp | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index f2638c9..173554b 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -742,7 +742,7 @@ void output( double *state , double etime ) {
   double *dens, *uwnd, *wwnd, *theta;
   double *etimearr;
   //Inform the user
-  if (mainproc) { printf("*** OUTPUT ***\n"); }
+  if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); }
   //Allocate some (big) temp arrays
   dens     = (double *) malloc(nx*nz*sizeof(double));
   uwnd     = (double *) malloc(nx*nz*sizeof(double));
@@ -755,7 +755,7 @@ void output( double *state , double etime ) {
   MPI_Info mpi_info;
   auto info_err = MPI_Info_create(&mpi_info);
   if (info_err != MPI_SUCCESS) {
-    printf("Error creating MPI Info object\n");
+    fprintf(stderr, "Error creating MPI Info object\n");
     MPI_Abort(MPI_COMM_WORLD, -1);
   }
 
@@ -769,7 +769,7 @@ void output( double *state , double etime ) {
     ncwrap( ncmpi_def_dim( ncid , "z" , (MPI_Offset) nz_glob      , &z_dimid ) , __LINE__ );
     //Create the variables
     dimids[0] = t_dimid;
-    ncwrap( ncmpi_def_var( ncid , "t"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ );
+    ncwrap( ncmpi_def_var( ncid , "t_var"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ );
     dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid;
     ncwrap( ncmpi_def_var( ncid , "dens"  , NC_DOUBLE , 3 , dimids ,  &dens_varid ) , __LINE__ );
     ncwrap( ncmpi_def_var( ncid , "uwnd"  , NC_DOUBLE , 3 , dimids ,  &uwnd_varid ) , __LINE__ );
@@ -785,7 +785,7 @@ void output( double *state , double etime ) {
     ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ );
     ncwrap( ncmpi_inq_varid( ncid , "wwnd"  ,  &wwnd_varid ) , __LINE__ );
     ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ );
-    ncwrap( ncmpi_inq_varid( ncid , "t"     ,     &t_varid ) , __LINE__ );
+    ncwrap( ncmpi_inq_varid( ncid , "t_var" ,     &t_varid ) , __LINE__ );
   }
 
   //Store perturbed values in the temp arrays for output
@@ -817,7 +817,8 @@ void output( double *state , double etime ) {
   if (mainproc) {
     st1[0] = num_out;
     ct1[0] = 1;
-    etimearr[0] = etime; ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ );
+    etimearr[0] = etime;
+    ncwrap( ncmpi_put_vara_double( ncid , t_varid , st1 , ct1 , etimearr ) , __LINE__ );
   }
   //End "independent" write mode
   ncwrap( ncmpi_end_indep_data(ncid) , __LINE__ );
@@ -844,9 +845,9 @@ void output( double *state , double etime ) {
 //Error reporting routine for the PNetCDF I/O
 void ncwrap( int ierr , int line ) {
   if (ierr != NC_NOERR) {
-    printf("NetCDF Error at line: %d\n", line);
-    printf("%s\n",ncmpi_strerror(ierr));
-    exit(-1);
+    fprintf(stderr, "NetCDF Error at line: %d\n", line);
+    fprintf(stderr, "%s\n", ncmpi_strerror(ierr));
+    MPI_Abort(MPI_COMM_WORLD, -1);
   }
 }
 

From a3cdf1f2acf6607139527b32feedff9e58ff282b Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 22:40:48 +0200
Subject: [PATCH 23/37] C: debug output to stderr not stdout

---
 c/miniWeather_serial.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index 173554b..a9ddf9c 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -133,8 +133,8 @@ int main(int argc, char **argv) {
   //Initial reductions for mass, kinetic energy, and total energy
   reductions(mass0,te0);
   {
-    printf( "mass0: %le\n" , mass0 );
-    printf( "te0:   %le\n" , te0   );
+    fprintf(stderr, "mass0: %le\n" , mass0);
+    fprintf(stderr, "te0:   %le\n" , te0  );
   }
 
   //Output the initial state
@@ -151,7 +151,7 @@ int main(int argc, char **argv) {
     perform_timestep(state,state_tmp,flux,tend,dt);
     //Inform the user
 #ifndef NO_INFORM
-    if (mainproc) { printf( "Elapsed Time: %lf / %lf\n", etime , sim_time ); }
+    if (mainproc) { fprintf(stderr, "Elapsed Time: %lf / %lf\n", etime , sim_time ); }
 #endif
     //Update the elapsed time and output counter
     etime = etime + dt;
@@ -161,25 +161,27 @@ int main(int argc, char **argv) {
       output_counter = output_counter - output_freq;
       output(state,etime);
     }
+#if 0
     {
       double mass = 0.0;
       double te = 0.0;
       reductions(mass, te);
-      printf( "mass: %le\n" , mass );
-      printf( "te:   %le\n" , te   );
+      fprintf(stderr, "mass: %le\n" , mass );
+      fprintf(stderr, "te:   %le\n" , te   );
     }
+#endif // 0
   }
   auto t2 = std::chrono::steady_clock::now();
   if (mainproc) {
-    std::cout << "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n";
+    std::cerr << "CPU Time: " << std::chrono::duration<double>(t2-t1).count() << " sec\n";
   }
 
   //Final reductions for mass, kinetic energy, and total energy
   reductions(mass,te);
 
   if (mainproc) {
-    printf( "d_mass: %le\n" , (mass - mass0)/mass0 );
-    printf( "d_te:   %le\n" , (te   - te0  )/te0   );
+    fprintf(stderr, "d_mass: %le\n" , (mass - mass0)/mass0 );
+    fprintf(stderr, "d_te:   %le\n" , (te   - te0  )/te0   );
   }
 
   finalize();
@@ -194,7 +196,7 @@ int main(int argc, char **argv) {
 // q**    = q[n] + dt/2 * rhs(q*  )
 // q[n+1] = q[n] + dt/1 * rhs(q** )
 void perform_timestep( double *state , double *state_tmp , double *flux , double *tend , double dt ) {
-  printf("direction_switch: %d\n", direction_switch);
+  //fprintf(stderr, "direction_switch: %d\n", direction_switch);
   if (direction_switch) {
     //x-direction first
     semi_discrete_step( state , state     , state_tmp , dt / 3 , DIR_X , flux , tend );
@@ -535,9 +537,9 @@ void init( int *argc , char ***argv ) {
 
   //If I'm the main process in MPI, display some grid information
   if (mainproc) {
-    printf( "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
-    printf( "dx,dz: %lf %lf\n",dx,dz);
-    printf( "dt: %lf\n",dt);
+    fprintf(stderr, "nx_glob, nz_glob: %d %d\n", nx_glob, nz_glob);
+    fprintf(stderr, "dx,dz: %lf %lf\n",dx,dz);
+    fprintf(stderr, "dt: %lf\n",dt);
   }
   //Want to make sure this info is displayed before further output
   ierr = MPI_Barrier(MPI_COMM_WORLD);

From 79212d0d2b4e5e212e214c7548493b16f961c587 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 22:41:02 +0200
Subject: [PATCH 24/37] C: Add build script that works for me

---
 c/build/cmake_kermit.sh | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 c/build/cmake_kermit.sh

diff --git a/c/build/cmake_kermit.sh b/c/build/cmake_kermit.sh
new file mode 100644
index 0000000..5901101
--- /dev/null
+++ b/c/build/cmake_kermit.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+PNETCDF_ROOT=/raid/mhoemmen/pkg/pnetcdf-1.14.0
+SRC_ROOT=/raid/mhoemmen/src/miniWeather/c
+OPT_FLAGS="-g -O2"
+
+cmake \
+  -DCMAKE_CXX_COMPILER=mpic++ \
+  -DCMAKE_C_COMPILER=mpicc \
+  -DCMAKE_Fortran_COMPILER=mpif90 \
+  -DCXXFLAGS="${OPT_FLAGS} -I${PNETCDF_ROOT}/include" \
+  -DLDFLAGS="-L${PNETCDF_ROOT}/lib -lpnetcdf" \
+  -DNX=100 \
+  -DNZ=50 \
+  -DSIM_TIME=20 \
+  -DOUT_FREQ=10 \
+  ${SRC_ROOT}

From 953601e2144614d9fea84ed928b1c9eef3eb777d Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 15:35:33 -0600
Subject: [PATCH 25/37] Python: Starting on netcdf output

---
 python/miniWeather.py | 62 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 61048ca..2b986f1 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -856,7 +856,67 @@ def output(
     fixed_data: Fixed_data
   ) -> int: # num_out (updated)
 
-  # TODO (mfh 2025/03/04) Actually write to the output file.
+  # Get dimensions from fixed_data
+  nx = fixed_data.nx
+  nz = fixed_data.nz
+  i_beg = fixed_data.i_beg 
+  k_beg = fixed_data.k_beg
+  mainproc = fixed_data.mainproc
+  hy_dens_cell = fixed_data.hy_dens_cell
+  hy_dens_theta_cell = fixed_data.hy_dens_theta_cell
+
+  # Inform the user
+  if mainproc:
+    print("*** OUTPUT ***")
+
+  # Allocate temp arrays
+  #
+  # TODO (mfh 2025/03/10) Check output order
+  dens = np.zeros((nz,nx), dtype=real)
+  uwnd = np.zeros((nz,nx), dtype=real) 
+  wwnd = np.zeros((nz,nx), dtype=real)
+  theta = np.zeros((nz,nx), dtype=real)
+  etimearr = np.zeros((1), dtype=real)
+
+  # Store perturbed values in temp arrays
+  for k in range(nz):
+    for i in range(nx):
+      ind_r = ID_DENS*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
+      ind_u = ID_UMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
+      ind_w = ID_WMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
+      ind_t = ID_RHOT*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
+      dens[k,i] = state[ind_r]
+      uwnd[k,i] = state[ind_u] / (hy_dens_cell[k+hs] + state[ind_r])
+      wwnd[k,i] = state[ind_w] / (hy_dens_cell[k+hs] + state[ind_r])
+      theta[k,i] = (state[ind_t] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ind_r]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]
+
+  with (Dataset("output.nc", "w") if etime == 0 else Dataset("output.nc", "a")) as nc:
+    # Write output using netCDF4
+    if etime == 0:
+      # Create dimensions
+      nc.createDimension("t", None) # unlimited
+      nc.createDimension("x", nx_glob)
+      nc.createDimension("z", nz_glob)
+      # Create variables
+      t_var = nc.createVariable("t_var", real, ("t",))
+      dens_var = nc.createVariable("dens", real, ("t","z","x"))
+      uwnd_var = nc.createVariable("uwnd", real, ("t","z","x"))
+      wwnd_var = nc.createVariable("wwnd", real, ("t","z","x"))
+      theta_var = nc.createVariable("theta", real, ("t","z","x"))
+    else:
+      t_var = nc.variables["t_var"]
+      dens_var = nc.variables["dens"] 
+      uwnd_var = nc.variables["uwnd"]
+      wwnd_var = nc.variables["wwnd"]
+      theta_var = nc.variables["theta"]
+
+    # Write data
+    if mainproc:
+      t_var[num_out] = etime
+    dens_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = dens
+    uwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = uwnd  
+    wwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = wwnd
+    theta_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = theta
 
   return num_out + 1
 

From 185fcafd975613e3cd087750fe9d5927a54fe7b3 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 15:40:07 -0600
Subject: [PATCH 26/37] Python: netcdf output 2

---
 python/miniWeather.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 2b986f1..a193ac4 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -881,14 +881,10 @@ def output(
   # Store perturbed values in temp arrays
   for k in range(nz):
     for i in range(nx):
-      ind_r = ID_DENS*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
-      ind_u = ID_UMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
-      ind_w = ID_WMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
-      ind_t = ID_RHOT*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs
-      dens[k,i] = state[ind_r]
-      uwnd[k,i] = state[ind_u] / (hy_dens_cell[k+hs] + state[ind_r])
-      wwnd[k,i] = state[ind_w] / (hy_dens_cell[k+hs] + state[ind_r])
-      theta[k,i] = (state[ind_t] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ind_r]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]
+      dens[k,i] = state[ID_DENS, k+hs, i+hs]
+      uwnd[k,i] = state[ID_UMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs])
+      wwnd[k,i] = state[ID_WMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs])
+      theta[k,i] = (state[ID_RHOT, k+hs, i+hs] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]
 
   with (Dataset("output.nc", "w") if etime == 0 else Dataset("output.nc", "a")) as nc:
     # Write output using netCDF4

From 5c2206adddbee88d467bcd67763aef33d29c015d Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 15:42:37 -0600
Subject: [PATCH 27/37] Python: netcdf output 3 (may have fixed it)

---
 python/miniWeather.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index a193ac4..95b731a 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -8,10 +8,10 @@
 # //////////////////////////////////////////////////////////////////////////////////////////
 
 import math
-import numpy as np
 import sys
 import timeit
-#include "pnetcdf.h"
+import numpy as np
+from netCDF4 import Dataset
 
 # "real" in the original C++ code could be either float or double.
 real = np.float64 # or np.float32

From a1d2fc3877913a80bfaa7241d6a6ae2107700d65 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 16:34:09 -0600
Subject: [PATCH 28/37] Fix Python output

---
 python/miniWeather.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 95b731a..6aae6cd 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -74,10 +74,10 @@
 # So, you'll want to have nx_glob be twice as large as nz_glob
 nz_glob: int = 50               # Number of total cells in the z-direction
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
-sim_time: real = 20.0 #1000.0         # How many seconds to run the simulation
+sim_time: real = 700.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
 #data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
-data_spec_int: int = DATA_SPEC_THERMAL # How to initialize the data
+data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // END USER-CONFIGURABLE PARAMETERS
 # ///////////////////////////////////////////////////////////////////////////////////////
@@ -201,7 +201,7 @@ def main_time_step_loop(dt: real, etime: real, num_out: int) -> None:
       # If it's time for output, reset the counter, and do output
       if output_freq >= 0 and output_counter >= output_freq:
         output_counter = output_counter - output_freq
-      num_out = output(state, etime, num_out, fixed_data)
+        num_out = output(state, etime, num_out, fixed_data)
 
       (mass, te) = reductions(state, fixed_data)
       if mainproc:

From 07e0ccca14ff3cfae563f1a45246062b4b187fe5 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 10 Mar 2025 16:57:24 -0600
Subject: [PATCH 29/37] Update ncview home page

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4326e14..53dd642 100644
--- a/README.md
+++ b/README.md
@@ -94,7 +94,7 @@ Once the time tendency is computed, the fluid PDEs are essentially now cast as a
 
 * Parallel-netcdf: https://github.com/Parallel-NetCDF/PnetCDF
   * This is a dependency for two reasons: (1) NetCDF files are easy to visualize and convenient to work with; (2) The users of this code shouldn't have to write their own parallel I/O.
-* Ncview: http://meteora.ucsd.edu/~pierce/ncview_home_page.html
+* Ncview: https://cirrus.ucsd.edu/ncview/
   * This is the easiest way to visualize NetCDF files.
 * MPI
 * For OpenACC: An OpenACC-capable compiler (PGI / Nvidia, Cray, GNU)

From 6cb325ef1f8170afd09de7d7e04b6c1f5077abde Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Wed, 12 Mar 2025 17:22:01 +0200
Subject: [PATCH 30/37] C: temporarily fix sim params

---
 c/miniWeather_serial.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index a9ddf9c..2a484df 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -56,11 +56,13 @@ constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444
 ///////////////////////////////////////////////////////////////////////////////////////
 //The x-direction length is twice as long as the z-direction length
 //So, you'll want to have nx_glob be twice as large as nz_glob
-int    constexpr nx_glob       = _NX;            //Number of total cells in the x-direction
-int    constexpr nz_glob       = _NZ;            //Number of total cells in the z-direction
-double constexpr sim_time      = _SIM_TIME;      //How many seconds to run the simulation
-double constexpr output_freq   = _OUT_FREQ;      //How frequently to output data to file (in seconds)
-int    constexpr data_spec_int = _DATA_SPEC;     //How to initialize the data
+
+int    constexpr nz_glob       = 1000;          //Number of total cells in the z-direction
+int    constexpr nx_glob       = 2 * nz_glob;   //Number of total cells in the x-direction
+double constexpr sim_time      = 700.0;      //How many seconds to run the simulation
+double constexpr output_freq   = 10.0;      //How frequently to output data to file (in seconds)
+//int    constexpr data_spec_int = _DATA_SPEC;     //How to initialize the data
+int    constexpr data_spec_int = DATA_SPEC_COLLISION;     //How to initialize the data
 double constexpr dx            = xlen / nx_glob; // grid spacing in the x-direction
 double constexpr dz            = zlen / nz_glob; // grid spacing in the x-direction
 ///////////////////////////////////////////////////////////////////////////////////////

From fc81e1e19a3834aa9dac79fcf6d4329239957357 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Wed, 12 Mar 2025 18:27:53 +0200
Subject: [PATCH 31/37] C: output control (still need to test)

---
 c/miniWeather_serial.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index 2a484df..251b3c1 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -16,6 +16,8 @@
 #include "pnetcdf.h"
 #include <chrono>
 
+#define MINIWEATHER_ONLY_OUTPUT_THETA 1
+
 constexpr double pi        = 3.14159265358979323846264338327;   //Pi
 constexpr double grav      = 9.8;                               //Gravitational acceleration (m / s^2)
 constexpr double cp        = 1004.;                             //Specific heat of dry air at constant pressure
@@ -748,9 +750,11 @@ void output( double *state , double etime ) {
   //Inform the user
   if (mainproc) { fprintf(stderr, "*** OUTPUT ***\n"); }
   //Allocate some (big) temp arrays
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)
   dens     = (double *) malloc(nx*nz*sizeof(double));
   uwnd     = (double *) malloc(nx*nz*sizeof(double));
   wwnd     = (double *) malloc(nx*nz*sizeof(double));
+#endif
   theta    = (double *) malloc(nx*nz*sizeof(double));
   etimearr = (double *) malloc(1    *sizeof(double));
 
@@ -775,9 +779,11 @@ void output( double *state , double etime ) {
     dimids[0] = t_dimid;
     ncwrap( ncmpi_def_var( ncid , "t_var"     , NC_DOUBLE , 1 , dimids ,     &t_varid ) , __LINE__ );
     dimids[0] = t_dimid; dimids[1] = z_dimid; dimids[2] = x_dimid;
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)
     ncwrap( ncmpi_def_var( ncid , "dens"  , NC_DOUBLE , 3 , dimids ,  &dens_varid ) , __LINE__ );
     ncwrap( ncmpi_def_var( ncid , "uwnd"  , NC_DOUBLE , 3 , dimids ,  &uwnd_varid ) , __LINE__ );
     ncwrap( ncmpi_def_var( ncid , "wwnd"  , NC_DOUBLE , 3 , dimids ,  &wwnd_varid ) , __LINE__ );
+#endif
     ncwrap( ncmpi_def_var( ncid , "theta" , NC_DOUBLE , 3 , dimids , &theta_varid ) , __LINE__ );
     //End "define" mode
     ncwrap( ncmpi_enddef( ncid ) , __LINE__ );
@@ -785,9 +791,11 @@ void output( double *state , double etime ) {
     //Open the file
     ncwrap( ncmpi_open( MPI_COMM_WORLD , "output.nc" , NC_WRITE , mpi_info , &ncid ) , __LINE__ );
     //Get the variable IDs
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)
     ncwrap( ncmpi_inq_varid( ncid , "dens"  ,  &dens_varid ) , __LINE__ );
     ncwrap( ncmpi_inq_varid( ncid , "uwnd"  ,  &uwnd_varid ) , __LINE__ );
     ncwrap( ncmpi_inq_varid( ncid , "wwnd"  ,  &wwnd_varid ) , __LINE__ );
+#endif
     ncwrap( ncmpi_inq_varid( ncid , "theta" , &theta_varid ) , __LINE__ );
     ncwrap( ncmpi_inq_varid( ncid , "t_var" ,     &t_varid ) , __LINE__ );
   }
@@ -796,12 +804,16 @@ void output( double *state , double etime ) {
   for (k=0; k<nz; k++) {
     for (i=0; i<nx; i++) {
       ind_r = ID_DENS*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs;
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)      
       ind_u = ID_UMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs;
       ind_w = ID_WMOM*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs;
+#endif
       ind_t = ID_RHOT*(nz+2*hs)*(nx+2*hs) + (k+hs)*(nx+2*hs) + i+hs;
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)
       dens [k*nx+i] = state[ind_r];
       uwnd [k*nx+i] = state[ind_u] / ( hy_dens_cell[k+hs] + state[ind_r] );
       wwnd [k*nx+i] = state[ind_w] / ( hy_dens_cell[k+hs] + state[ind_r] );
+#endif      
       theta[k*nx+i] = ( state[ind_t] + hy_dens_theta_cell[k+hs] ) / ( hy_dens_cell[k+hs] + state[ind_r] ) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs];
     }
   }
@@ -809,9 +821,11 @@ void output( double *state , double etime ) {
   //Write the grid data to file with all the processes writing collectively
   st3[0] = num_out; st3[1] = k_beg; st3[2] = i_beg;
   ct3[0] = 1      ; ct3[1] = nz   ; ct3[2] = nx   ;
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)      
   ncwrap( ncmpi_put_vara_double_all( ncid ,  dens_varid , st3 , ct3 , dens  ) , __LINE__ );
   ncwrap( ncmpi_put_vara_double_all( ncid ,  uwnd_varid , st3 , ct3 , uwnd  ) , __LINE__ );
   ncwrap( ncmpi_put_vara_double_all( ncid ,  wwnd_varid , st3 , ct3 , wwnd  ) , __LINE__ );
+#endif
   ncwrap( ncmpi_put_vara_double_all( ncid , theta_varid , st3 , ct3 , theta ) , __LINE__ );
 
   //Only the main process needs to write the elapsed time
@@ -837,9 +851,11 @@ void output( double *state , double etime ) {
   MPI_Info_free(&mpi_info);
   
   //Deallocate the temp arrays
+#if ! defined(MINIWEATHER_ONLY_OUTPUT_THETA)      
   free( dens     );
   free( uwnd     );
   free( wwnd     );
+#endif
   free( theta    );
   free( etimearr );
 #endif // 0

From bb31b7bcd7dcbd7709545b654c76af27e68139a8 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Wed, 12 Mar 2025 18:38:22 +0200
Subject: [PATCH 32/37] C: Remove hard-coding of parameters

---
 c/miniWeather_serial.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/c/miniWeather_serial.cpp b/c/miniWeather_serial.cpp
index 251b3c1..710cb4f 100644
--- a/c/miniWeather_serial.cpp
+++ b/c/miniWeather_serial.cpp
@@ -59,12 +59,11 @@ constexpr double qweights[] = { 0.277777777777777777777777777779E0 , 0.444444444
 //The x-direction length is twice as long as the z-direction length
 //So, you'll want to have nx_glob be twice as large as nz_glob
 
-int    constexpr nz_glob       = 1000;          //Number of total cells in the z-direction
-int    constexpr nx_glob       = 2 * nz_glob;   //Number of total cells in the x-direction
-double constexpr sim_time      = 700.0;      //How many seconds to run the simulation
-double constexpr output_freq   = 10.0;      //How frequently to output data to file (in seconds)
-//int    constexpr data_spec_int = _DATA_SPEC;     //How to initialize the data
-int    constexpr data_spec_int = DATA_SPEC_COLLISION;     //How to initialize the data
+int    constexpr nz_glob       = _NZ;            //Number of total cells in the z-direction
+int    constexpr nx_glob       = 2 * nz_glob;    //Number of total cells in the x-direction
+double constexpr sim_time      = _SIM_TIME;      //How many seconds to run the simulation
+double constexpr output_freq   = _OUT_FREQ;      //How frequently to output data to file (in seconds)
+int    constexpr data_spec_int = _DATA_SPEC;     //How to initialize the data
 double constexpr dx            = xlen / nx_glob; // grid spacing in the x-direction
 double constexpr dz            = zlen / nz_glob; // grid spacing in the x-direction
 ///////////////////////////////////////////////////////////////////////////////////////

From 4303ec062a72f6de416bb15aa1f61316124f9c6f Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Wed, 12 Mar 2025 13:12:46 -0600
Subject: [PATCH 33/37] Python: x boundary injection hack

For injection only, change the right boundary to be Dirichlet.

Add a C(++) build script for Linux + gcc.
---
 c/build/cmake_linux_gnu.sh | 25 +++++++++++++++++++++++++
 python/miniWeather.py      | 22 +++++++++++++++-------
 2 files changed, 40 insertions(+), 7 deletions(-)
 create mode 100644 c/build/cmake_linux_gnu.sh

diff --git a/c/build/cmake_linux_gnu.sh b/c/build/cmake_linux_gnu.sh
new file mode 100644
index 0000000..719420d
--- /dev/null
+++ b/c/build/cmake_linux_gnu.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+SRC_ROOT=../../../src/miniWeather/c
+OPT_FLAGS="-g -O2"
+
+#PNETCDF_LIB=/usr/lib/x86_64-linux-gnu
+#PNETCDF_LDFLAGS="-L${PNETCDF_LIB} -lpnetcdf"
+PNETCDF_LDFLAGS="-lpnetcdf"
+#PNETCDF_CXXFLAGS="-I$/usr/include"
+PNETCDF_CXXFLAGS=""
+
+DATA_SPEC="DATA_SPEC_INJECTION"
+
+cmake \
+  -DCMAKE_CXX_COMPILER=mpic++ \
+  -DCMAKE_C_COMPILER=mpicc \
+  -DCMAKE_Fortran_COMPILER=mpif90 \
+  -DCXXFLAGS="${OPT_FLAGS} ${PNETCDF_CXXFLAGS}" \
+  -DLDFLAGS="${PNETCDF_LDFLAGS}" \
+  -DNX=200 \
+  -DNZ=100 \
+  -DSIM_TIME=1200 \
+  -DOUT_FREQ=10 \
+  -DDATA_SPEC="${DATA_SPEC}" \
+  ${SRC_ROOT}
diff --git a/python/miniWeather.py b/python/miniWeather.py
index 6aae6cd..a59d0da 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -74,10 +74,10 @@
 # So, you'll want to have nx_glob be twice as large as nz_glob
 nz_glob: int = 50               # Number of total cells in the z-direction
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
-sim_time: real = 700.0         # How many seconds to run the simulation
+sim_time: real = 1000.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
-#data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
-data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data
+data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
+#data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // END USER-CONFIGURABLE PARAMETERS
 # ///////////////////////////////////////////////////////////////////////////////////////
@@ -507,10 +507,18 @@ def set_halo_values_x(
   # //////////////////////////////////////////////////////
   for ll in range(NUM_VARS):
     for k in range(nz):
-      state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2];
-      state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1];
-      state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ];
-      state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ];
+      if data_spec_int == DATA_SPEC_INJECTION:
+        # Dirichlet boundary on the right ONLY avoids
+        # spurious reflections on the right and four corners.
+        # This should not be considered physical and does not
+        # conserve energy (but injection doesn't anyway).
+        state[ll,hs+k,nx+hs  ] = 0.0
+        state[ll,hs+k,nx+hs+1] = 0.0
+      else:
+        state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2]
+        state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1]
+        state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ]
+        state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ]
 
   if data_spec_int == DATA_SPEC_INJECTION:
     if myrank == 0:

From 8c41a31de0bcb78bd494908f588dc66c69e6cbf6 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 17 Mar 2025 11:30:02 -0600
Subject: [PATCH 34/37] Build script: Change default model

---
 c/build/cmake_linux_gnu.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/c/build/cmake_linux_gnu.sh b/c/build/cmake_linux_gnu.sh
index 719420d..861e651 100644
--- a/c/build/cmake_linux_gnu.sh
+++ b/c/build/cmake_linux_gnu.sh
@@ -9,7 +9,7 @@ PNETCDF_LDFLAGS="-lpnetcdf"
 #PNETCDF_CXXFLAGS="-I$/usr/include"
 PNETCDF_CXXFLAGS=""
 
-DATA_SPEC="DATA_SPEC_INJECTION"
+DATA_SPEC="DATA_SPEC_COLLISION"
 
 cmake \
   -DCMAKE_CXX_COMPILER=mpic++ \
@@ -19,7 +19,7 @@ cmake \
   -DLDFLAGS="${PNETCDF_LDFLAGS}" \
   -DNX=200 \
   -DNZ=100 \
-  -DSIM_TIME=1200 \
+  -DSIM_TIME=1000 \
   -DOUT_FREQ=10 \
   -DDATA_SPEC="${DATA_SPEC}" \
   ${SRC_ROOT}

From 74a519ea08d2e17fc0a9898d1577dd1ab02a8b07 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 17 Mar 2025 11:30:29 -0600
Subject: [PATCH 35/37] Python: Add option only to output theta (temp density)

---
 python/miniWeather.py | 48 ++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index a59d0da..ac39683 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -17,6 +17,9 @@
 real = np.float64 # or np.float32
 double = np.float64
 
+# Only output the temperature density ("theta")
+miniweather_output_only_theta: bool = True
+
 #
 # Parameters for indexing and flags
 # (effectively enums, but we leave them as constants
@@ -72,7 +75,7 @@
 # ///////////////////////////////////////////////////////////////////////////////////////
 # The x-direction length is twice as long as the z-direction length
 # So, you'll want to have nx_glob be twice as large as nz_glob
-nz_glob: int = 50               # Number of total cells in the z-direction
+nz_glob: int = 100              # Number of total cells in the z-direction
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
 sim_time: real = 1000.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
@@ -512,8 +515,15 @@ def set_halo_values_x(
         # spurious reflections on the right and four corners.
         # This should not be considered physical and does not
         # conserve energy (but injection doesn't anyway).
+
+        state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2]
+        state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1]
         state[ll,hs+k,nx+hs  ] = 0.0
         state[ll,hs+k,nx+hs+1] = 0.0
+
+        #state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ]
+        #state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ]
+
       else:
         state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2]
         state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1]
@@ -880,18 +890,20 @@ def output(
   # Allocate temp arrays
   #
   # TODO (mfh 2025/03/10) Check output order
-  dens = np.zeros((nz,nx), dtype=real)
-  uwnd = np.zeros((nz,nx), dtype=real) 
-  wwnd = np.zeros((nz,nx), dtype=real)
+  if not miniweather_output_only_theta:
+    dens = np.zeros((nz,nx), dtype=real)
+    uwnd = np.zeros((nz,nx), dtype=real) 
+    wwnd = np.zeros((nz,nx), dtype=real)
   theta = np.zeros((nz,nx), dtype=real)
   etimearr = np.zeros((1), dtype=real)
 
   # Store perturbed values in temp arrays
   for k in range(nz):
     for i in range(nx):
-      dens[k,i] = state[ID_DENS, k+hs, i+hs]
-      uwnd[k,i] = state[ID_UMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs])
-      wwnd[k,i] = state[ID_WMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs])
+      if not miniweather_output_only_theta:
+        dens[k,i] = state[ID_DENS, k+hs, i+hs]
+        uwnd[k,i] = state[ID_UMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs])
+        wwnd[k,i] = state[ID_WMOM, k+hs, i+hs] / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs])
       theta[k,i] = (state[ID_RHOT, k+hs, i+hs] + hy_dens_theta_cell[k+hs]) / (hy_dens_cell[k+hs] + state[ID_DENS, k+hs, i+hs]) - hy_dens_theta_cell[k+hs] / hy_dens_cell[k+hs]
 
   with (Dataset("output.nc", "w") if etime == 0 else Dataset("output.nc", "a")) as nc:
@@ -903,23 +915,27 @@ def output(
       nc.createDimension("z", nz_glob)
       # Create variables
       t_var = nc.createVariable("t_var", real, ("t",))
-      dens_var = nc.createVariable("dens", real, ("t","z","x"))
-      uwnd_var = nc.createVariable("uwnd", real, ("t","z","x"))
-      wwnd_var = nc.createVariable("wwnd", real, ("t","z","x"))
+      if not miniweather_output_only_theta:
+        dens_var = nc.createVariable("dens", real, ("t","z","x"))
+        uwnd_var = nc.createVariable("uwnd", real, ("t","z","x"))
+        wwnd_var = nc.createVariable("wwnd", real, ("t","z","x"))
       theta_var = nc.createVariable("theta", real, ("t","z","x"))
     else:
       t_var = nc.variables["t_var"]
-      dens_var = nc.variables["dens"] 
-      uwnd_var = nc.variables["uwnd"]
-      wwnd_var = nc.variables["wwnd"]
+      if not miniweather_output_only_theta:
+        dens_var = nc.variables["dens"] 
+        uwnd_var = nc.variables["uwnd"]
+        wwnd_var = nc.variables["wwnd"]
       theta_var = nc.variables["theta"]
 
     # Write data
     if mainproc:
       t_var[num_out] = etime
-    dens_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = dens
-    uwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = uwnd  
-    wwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = wwnd
+
+    if not miniweather_output_only_theta:
+      dens_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = dens
+      uwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = uwnd  
+      wwnd_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = wwnd
     theta_var[num_out,k_beg:k_beg+nz,i_beg:i_beg+nx] = theta
 
   return num_out + 1

From 94839255a173915aa3821cad5873694966bbc1fc Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 17 Mar 2025 11:40:38 -0600
Subject: [PATCH 36/37] Python: Remove injection boundary special case

---
 python/miniWeather.py | 23 ++++-------------------
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index ac39683..003ab73 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -510,25 +510,10 @@ def set_halo_values_x(
   # //////////////////////////////////////////////////////
   for ll in range(NUM_VARS):
     for k in range(nz):
-      if data_spec_int == DATA_SPEC_INJECTION:
-        # Dirichlet boundary on the right ONLY avoids
-        # spurious reflections on the right and four corners.
-        # This should not be considered physical and does not
-        # conserve energy (but injection doesn't anyway).
-
-        state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2]
-        state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1]
-        state[ll,hs+k,nx+hs  ] = 0.0
-        state[ll,hs+k,nx+hs+1] = 0.0
-
-        #state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ]
-        #state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ]
-
-      else:
-        state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2]
-        state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1]
-        state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ]
-        state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ]
+      state[ll,hs+k,0      ] = state[ll,hs+k,nx+hs-2]
+      state[ll,hs+k,1      ] = state[ll,hs+k,nx+hs-1]
+      state[ll,hs+k,nx+hs  ] = state[ll,hs+k,hs     ]
+      state[ll,hs+k,nx+hs+1] = state[ll,hs+k,hs+1   ]
 
   if data_spec_int == DATA_SPEC_INJECTION:
     if myrank == 0:

From a01bc207dd40728e05b95feca863182c1b091907 Mon Sep 17 00:00:00 2001
From: Mark Hoemmen <mhoemmen@nvidia.com>
Date: Mon, 17 Mar 2025 14:25:12 -0600
Subject: [PATCH 37/37] Python: Restore default model

---
 python/miniWeather.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/miniWeather.py b/python/miniWeather.py
index 003ab73..bcb2e13 100644
--- a/python/miniWeather.py
+++ b/python/miniWeather.py
@@ -79,8 +79,7 @@
 nx_glob: int = 2 * nz_glob      # Number of total cells in the x-direction
 sim_time: real = 1000.0         # How many seconds to run the simulation
 output_freq: real = 10.0        # How frequently to output data to file (in seconds)
-data_spec_int: int = DATA_SPEC_INJECTION # How to initialize the data
-#data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data
+data_spec_int: int = DATA_SPEC_COLLISION #DATA_SPEC_THERMAL # How to initialize the data
 # ///////////////////////////////////////////////////////////////////////////////////////
 # // END USER-CONFIGURABLE PARAMETERS
 # ///////////////////////////////////////////////////////////////////////////////////////