From 8010ea06fe3718be926ed06203af3ea66b349822 Mon Sep 17 00:00:00 2001
From: Sergi Siso <sergi.siso@stfc.ac.uk>
Date: Tue, 12 Apr 2022 11:39:02 +0100
Subject: [PATCH 1/7] #84 Add warm up iterations to NemoLite2D device
 accelerated manual implementations

---
 .../manual_versions/psykal_acc/nemolite2d.f90 | 76 ++++++++++---------
 .../manual_versions/psykal_cpp/nemolite2d.f90 | 17 ++++-
 .../manual_versions/psykal_kokkos/Makefile    |  4 +-
 .../psykal_kokkos/nemolite2d.f90              | 16 +++-
 .../psykal_kokkos/time_step_views_kokkos.cpp  |  7 +-
 .../psykal_opencl/nemolite2d.f90              | 18 +++--
 .../psykal_sycl/nemolite2d.f90                | 23 +++++-
 7 files changed, 106 insertions(+), 55 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
index e1c1f305..49428770 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
@@ -1,5 +1,5 @@
 program gocean2d
-  use dl_timer
+  use dl_timer, only: timer_start, timer_stop, timer_init, timer_report, i_def64
   use grid_mod
   use field_mod
   use initialisation_mod, only: initialisation
@@ -7,11 +7,11 @@ program gocean2d
   use gocean2d_io_mod, only: model_write
   use gocean_mod,      only: model_write_log, gocean_initialise, &
                              gocean_finalise
+  !use likwid
 
-  !> GOcean2d is a Horizontal 2D hydrodynamic ocean model initially developed
-  !! by Hedong Liu, UK National Oceanography Centre (NOC), which:
-  !!   1) uses structured grid
-  !!   2) uses direct data addressing structures
+  !> A Horizontal 2D hydrodynamic ocean model which
+  !!   1) using structured grid
+  !!   2) using direct data addressing structures
 
   implicit none
 
@@ -30,9 +30,13 @@ program gocean2d
   type(r2d_field) :: ua_fld, va_fld
 
   ! time stepping index
-  integer :: istp   
-  integer :: itimer0
+  integer     :: istp  
+  integer     :: itimer0
 
+  ! Scratch space for logging messages
+  character(len=160) :: log_str
+
+  ! Initialise GOcean infrastructure
   call gocean_initialise()
 
   ! Create the model grid. We use a NE offset (i.e. the U, V and F
@@ -45,6 +49,7 @@ program gocean2d
 
   !! read in model parameters and configure the model grid 
   CALL model_init(model_grid)
+  !call likwid_markerInit()
 
   ! Create fields on this grid
 
@@ -78,16 +83,35 @@ program gocean2d
 
   call model_write(model_grid, 0, ht_fld, sshn_t_fld, un_fld, vn_fld)
 
+  write(log_str, "('Simulation domain = (',I4,':',I4,',',I4,':',I4,')')") &
+                       model_grid%subdomain%global%xstart, &
+                       model_grid%subdomain%global%xstop,  &
+                       model_grid%subdomain%global%ystart, &
+                       model_grid%subdomain%global%ystop
+  call model_write_log("((A))", TRIM(log_str))
+
+  ! Start timer for time-stepping section
+  CALL timer_start(itimer0, label='Warm up', &
+                   num_repeats=INT(1,kind=i_def64) )
+
+  call step(nit000,                               &
+           ua_fld, va_fld, un_fld, vn_fld,     &
+           sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+           ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+           hu_fld, hv_fld, ht_fld)
+
+  ! Stop the timer for the time-stepping section
+  call timer_stop(itimer0)
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
-                   num_repeats=int((nitend-nit000+1),8) )
+                   num_repeats=INT(nitend-nit000,kind=i_def64) )
 
   !! time stepping 
-  do istp = nit000, nitend, 1
+  do istp = nit000+1, nitend, 1
 
      !call model_write_log("('istp == ',I6)",istp)
 
-     call step(model_grid, istp,                   &
+     call step(istp,                               &
                ua_fld, va_fld, un_fld, vn_fld,     &
                sshn_t_fld, sshn_u_fld, sshn_v_fld, &
                ssha_t_fld, ssha_u_fld, ssha_v_fld, &
@@ -102,14 +126,15 @@ program gocean2d
   call timer_stop(itimer0)
 
   ! Compute and output some checksums for error checking
-  call model_write_log("('ua checksum = ',E16.8)", &
+  call model_write_log("('ua checksum = ', E16.8)", &
                        field_checksum(ua_fld))
-  call model_write_log("('va checksum = ',E16.8)", &
+  call model_write_log("('va checksum = ', E16.8)", &
                        field_checksum(va_fld))
 
   !! finalise the model run
   call model_finalise()
-  
+  !call likwid_markerClose()
+
   call model_write_log("((A))", 'Simulation finished!!')
 
   call gocean_finalise()
@@ -118,7 +143,7 @@ end program gocean2d
 
 !+++++++++++++++++++++++++++++++++++
 
-subroutine step(grid, istp, &
+subroutine step(istp,           &
                 ua, va, un, vn, &
                 sshn, sshn_u, sshn_v, ssha, ssha_u, ssha_v, &
                 hu, hv, ht)
@@ -128,7 +153,6 @@ subroutine step(grid, istp, &
   use time_step_mod, only: invoke_time_step
   use gocean2d_io_mod, only: model_write
   implicit none
-  type(grid_type), intent(in) :: grid
   !> The current time step
   integer,         intent(in) :: istp
   type(r2d_field), intent(inout) :: un, vn, sshn, sshn_u, sshn_v
@@ -139,27 +163,5 @@ subroutine step(grid, istp, &
                         sshn, sshn_u, sshn_v, &
                         hu, hv, ht, ua, va, un, vn)
 
-!  call invoke(                                               &
-!              continuity(istp, ssha, sshn_t, sshn_u, sshn_v, &
-!                         hu, hv, un, vn),                    &
-!              momentum_u(ua, un, vn,                         &
-!                         ssha_u, sshn_t, sshn_u, sshn_v),    &
-!              momentum_v(va, un, vn, hu, hv, ht,             &
-!                         ssha_v, sshn_t, sshn_u, sshn_v),    &
-!              bc_ssh(istp, ssha),                            &
-!              bc_solid_u(ua),                                &
-!              bc_solid_v(va),                                &
-!              bc_flather_u(ua, hu, sshn_u),                  &
-!              bc_flather_v(va, hv, sshn_v),                  &
-!              copy_field(ua, un),                            &
-!              copy_field(va, vn),                            &
-!              copy_field(ssha, sshn_t),                      &
-!              next_sshu(sshn_u, sshn_t),                     &
-!              next_sshv(sshn_v, sshn_t)                      &
-!             )
-
-
-!  call model_write(grid, istp, ht, sshn, un, vn)
-
 end subroutine step
 
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90
index 96c9a30f..49428770 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_cpp/nemolite2d.f90
@@ -31,7 +31,6 @@ program gocean2d
 
   ! time stepping index
   integer     :: istp  
-  real(go_wp) :: rstp 
   integer     :: itimer0
 
   ! Scratch space for logging messages
@@ -91,12 +90,24 @@ program gocean2d
                        model_grid%subdomain%global%ystop
   call model_write_log("((A))", TRIM(log_str))
 
+  ! Start timer for warm-up section
+  CALL timer_start(itimer0, label='Warm up', &
+                   num_repeats=INT(1,kind=i_def64) )
+
+  call step(nit000,                               &
+           ua_fld, va_fld, un_fld, vn_fld,     &
+           sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+           ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+           hu_fld, hv_fld, ht_fld)
+
+  ! Stop the timer for the warm-up section
+  call timer_stop(itimer0)
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
-                   num_repeats=INT(nitend-nit000+1,kind=i_def64) )
+                   num_repeats=INT(nitend-nit000,kind=i_def64) )
 
   !! time stepping 
-  do istp = nit000, nitend, 1
+  do istp = nit000+1, nitend, 1
 
      !call model_write_log("('istp == ',I6)",istp)
 
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
index 9be53b00..dffa828b 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
@@ -35,12 +35,12 @@ KOKKOS_ARCH = Volta70 # Pascal64
 CFLAGS := -O3
 # Still use the selected compiler but using the Kokkos nvcc_wrapper
 NVCC_WRAPPER_DEFAULT_COMPILER = $(CXX)
-CXX := $(KOKKOS_PATH)/bin/nvcc_wrapper
+CXX := $(KOKKOS_PATH)/bin/nvcc_wrapper -allow-unsupported-compiler
 # The enable lambda option is necessary for the nvcc compiler to recognise
 # as CUDA kernels the lambda-inlined functions.
 KOKKOS_CUDA_OPTIONS = "enable_lambda"
 # If CUDA_LIB is not provided, infer path from the nvcc compiler location.
-CUDA_LIB ?= $(shell echo $(shell which nvcc) | sed 's/bin\/nvcc/lib64/g')
+CUDA_ROOT ?= $(shell echo $(shell which nvcc) | sed 's/bin\/nvcc//g')
 else
 $(error "Unrecognised KOKKOS_DEVICES value: $(KOKKOS_DEVICES)")
 endif
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90
index c8109789..49428770 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/nemolite2d.f90
@@ -90,12 +90,24 @@ program gocean2d
                        model_grid%subdomain%global%ystop
   call model_write_log("((A))", TRIM(log_str))
 
+  ! Start timer for warm-up section
+  CALL timer_start(itimer0, label='Warm up', &
+                   num_repeats=INT(1,kind=i_def64) )
+
+  call step(nit000,                               &
+           ua_fld, va_fld, un_fld, vn_fld,     &
+           sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+           ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+           hu_fld, hv_fld, ht_fld)
+
+  ! Stop the timer for the warm-up section
+  call timer_stop(itimer0)
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
-                   num_repeats=INT(nitend-nit000+1,kind=i_def64) )
+                   num_repeats=INT(nitend-nit000,kind=i_def64) )
 
   !! time stepping 
-  do istp = nit000, nitend, 1
+  do istp = nit000+1, nitend, 1
 
      !call model_write_log("('istp == ',I6)",istp)
 
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
index bbdd4651..918ab609 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
@@ -699,8 +699,11 @@ extern "C" void kokkos_read_from_device(double_2dview from, double * to,
     // Then, we copy the data from the mirror to the original location.
     // Since the mirror data layout is decided by kokkos, we make explicit
     // copies of each element to its location.
-    for(int jj=starty; jj < starty+ny; jj++){
-        for(int ji=startx; ji < startx+nx; ji++){
+    // We need to adjust the provided Fortran bounds to 0-indexing
+    int starty0 = starty - 1;
+    int startx0 = startx - 1;
+    for(int jj=starty0; jj < starty0+ny-1; jj++){
+        for(int ji=startx0; ji < startx0+nx-1; ji++){
             int idx = (jj * fortran_array_width + ji);
             to[idx] = mirror(jj, ji);
         }
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90
index a3e04177..5aa092b7 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_opencl/nemolite2d.f90
@@ -50,7 +50,6 @@ program gocean2d
 
   !! read in model parameters and configure the model grid 
   CALL model_init(model_grid)
-  !call likwid_markerInit()
 
   ! Create fields on this grid
 
@@ -91,15 +90,22 @@ program gocean2d
                        model_grid%subdomain%global%ystop
   call model_write_log("((A))", TRIM(log_str))
 
+  ! Warming up step
+  CALL timer_start(itimer0, label='Warm up step', &
+                   num_repeats=INT(1,kind=i_def64) )
+  call step(nit000,                               &
+            ua_fld, va_fld, un_fld, vn_fld,     &
+            sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+            ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+            hu_fld, hv_fld, ht_fld)
+  call timer_stop(itimer0)
+
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
-                   num_repeats=INT(nitend-nit000+1,kind=i_def64) )
+                   num_repeats=INT(nitend-nit000,kind=i_def64) )
 
   !! time stepping 
-  do istp = nit000, nitend, 1
-
-     !call model_write_log("('istp == ',I6)",istp)
-     rstp = real(istp, go_wp)
+  do istp = nit000+1, nitend, 1
 
      call step(istp,                               &
                ua_fld, va_fld, un_fld, vn_fld,     &
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90
index 604348a2..49428770 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_sycl/nemolite2d.f90
@@ -7,6 +7,7 @@ program gocean2d
   use gocean2d_io_mod, only: model_write
   use gocean_mod,      only: model_write_log, gocean_initialise, &
                              gocean_finalise
+  !use likwid
 
   !> A Horizontal 2D hydrodynamic ocean model which
   !!   1) using structured grid
@@ -30,7 +31,6 @@ program gocean2d
 
   ! time stepping index
   integer     :: istp  
-  real(go_wp) :: rstp 
   integer     :: itimer0
 
   ! Scratch space for logging messages
@@ -43,11 +43,13 @@ program gocean2d
   ! points immediately to the North and East of a T point all have the
   ! same i,j index).  This is the same offset scheme as used by NEMO.
   model_grid = grid_type(GO_ARAKAWA_C, &
+  !  BC_PERIODIC, BC_NON_PERIODIC ??
                          (/GO_BC_EXTERNAL,GO_BC_EXTERNAL,GO_BC_NONE/), &
                          GO_OFFSET_NE)
 
   !! read in model parameters and configure the model grid 
   CALL model_init(model_grid)
+  !call likwid_markerInit()
 
   ! Create fields on this grid
 
@@ -88,12 +90,26 @@ program gocean2d
                        model_grid%subdomain%global%ystop
   call model_write_log("((A))", TRIM(log_str))
 
+  ! Start timer for warm-up section
+  CALL timer_start(itimer0, label='Warm up', &
+                   num_repeats=INT(1,kind=i_def64) )
+
+  call step(nit000,                               &
+           ua_fld, va_fld, un_fld, vn_fld,     &
+           sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+           ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+           hu_fld, hv_fld, ht_fld)
+
+  ! Stop the timer for the warm-up section
+  call timer_stop(itimer0)
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
-                   num_repeats=INT(nitend-nit000+1,kind=i_def64) )
+                   num_repeats=INT(nitend-nit000,kind=i_def64) )
 
   !! time stepping 
-  do istp = nit000, nitend, 1
+  do istp = nit000+1, nitend, 1
+
+     !call model_write_log("('istp == ',I6)",istp)
 
      call step(istp,                               &
                ua_fld, va_fld, un_fld, vn_fld,     &
@@ -117,6 +133,7 @@ program gocean2d
 
   !! finalise the model run
   call model_finalise()
+  !call likwid_markerClose()
 
   call model_write_log("((A))", 'Simulation finished!!')
 

From 5400ecbaeafe6698abd14c7af00fdf63cd06c8d4 Mon Sep 17 00:00:00 2001
From: Sergi Siso <sergi.siso@stfc.ac.uk>
Date: Thu, 14 Apr 2022 01:10:59 -0700
Subject: [PATCH 2/7] Add HIP backend to the NemoLite2D kokkos makefile

---
 .../nemolite2d/manual_versions/psykal_kokkos/Makefile  |  4 ++++
 .../nemolite2d/manual_versions/psykal_kokkos/README.md | 10 +++++++---
 .../psykal_kokkos/time_step_views_kokkos.cpp           |  6 +++++-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
index dffa828b..3c1d083f 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
@@ -41,6 +41,10 @@ CXX := $(KOKKOS_PATH)/bin/nvcc_wrapper -allow-unsupported-compiler
 KOKKOS_CUDA_OPTIONS = "enable_lambda"
 # If CUDA_LIB is not provided, infer path from the nvcc compiler location.
 CUDA_ROOT ?= $(shell echo $(shell which nvcc) | sed 's/bin\/nvcc//g')
+else ifeq ($(KOKKOS_DEVICES),HIP)
+$(info "Using HIP device")
+CXX := hipcc
+CFLAGS := -O3
 else
 $(error "Unrecognised KOKKOS_DEVICES value: $(KOKKOS_DEVICES)")
 endif
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md
index cfe191a3..47e36d54 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/README.md
@@ -52,15 +52,19 @@ to the Kokkos parallel dispatch. This allows Kokkos to control the data layout,
 the padding, and the synchonization between host and device (GPU execution)
 but it requires to keep two copies of the simulation data.
 This version is available in `time_step_views_kokkos.cpp` and can be built
-with an OpenMP or a Cuda backend by setting the KOKKOS_DEVICES environment
-variable. Note that the Cuda back-end requires that the `nvcc` compiler is
-installed on the system and available in PATH. See below examples of how to
+with an OpenMP, Cuda or HIP backend by setting the `KOKKOS_DEVICES` environment
+variable. Note that the Cuda back-end requires the `nvcc` compiler and
+the HIP back-end requires the `hipcc` compiler. These need to be
+installed on the system, with the necessary paths available in `PATH`,
+`CPATH` and `LD_LIBRARY_PATH`. See below for examples of how to
 compile the Kokkos View version for different devices:
 
     > make nemolite2d_views_kokkos KOKKOS_DEVICES=OpenMP
 
     > make nemolite2d_views_kokkos KOKKOS_DEVICES=Cuda
 
+    > make nemolite2d_views_kokkos KOKKOS_DEVICES=HIP
+
 ## Running ##
 
 Model parameters (size of domain [jpiglo,jpjglo], number of time-steps
diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
index 918ab609..3edb242f 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
@@ -11,7 +11,7 @@
 #include "timing.h"
 #endif
 
-#define TILE {64,4}
+#define TILE {64,1}
 
 // Create 2D View types for the Fields and Grid arrays
 typedef Kokkos::View<double**> double_2dview;
@@ -115,7 +115,11 @@ extern "C" void c_invoke_time_step(
     // The execution space is given as a preprocessor define when compiling
     // this file. e.g. `g++ -DEXEC_SPACE=OpenMP time_step_kokkos.cpp -c`
 #if defined (EXECUTION_SPACE)
+    #if EXECUTION_SPACE == HIP
+    using execution_space = Kokkos::Experimental::EXECUTION_SPACE;
+    #else
     using execution_space = Kokkos::EXECUTION_SPACE;
+    #endif
 #else
     using execution_space = Kokkos::DefaultExecutionSpace;
 #endif

From 0f4dc3f992e25d1dc917b3c316533d263b0a7091 Mon Sep 17 00:00:00 2001
From: Sergi Siso <sergi.siso@stfc.ac.uk>
Date: Thu, 14 Apr 2022 01:22:08 -0700
Subject: [PATCH 3/7] Update llvm.sh compiler setup script for AMD GPU
 acceleration

---
 compiler_setup/llvm.sh | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/compiler_setup/llvm.sh b/compiler_setup/llvm.sh
index 33578f14..ce940aa3 100644
--- a/compiler_setup/llvm.sh
+++ b/compiler_setup/llvm.sh
@@ -2,30 +2,30 @@
 # ================================================
 # This is an experimental file so other flags may be
 # needed for accelerated compilation
-# Alternative flags have been provided in the comments
-# where they have been found to be useful
 
 # Fortran compiler
 F90=flang
+# If flang is not available or causes compiler errors uncomment gfortran:
+# F90=gfortran
 # C and C++ compiler
 CC=clang
 CXX=clang++
 
 # C and C++ flags
-# note that -g is used for debugging information
-# as this is an experimental implementation
-CFLAGS="-O3 -march=native -g"
+CFLAGS="-O3"
 # Fortran compiler flags
-# As above, -g provides debugging information
-F90FLAGS="-O3 -march=native -g"
+F90FLAGS="-O3"
 # Flags to use when compiling with OpenMP support
 OMPFLAGS="-fopenmp"
 # Flags to use when compiling with OpenMP GPU offloading support
-OMPTARGETFLAGS="-fopenmp -fopenmp-targets=nvptx64"
-# OMPTARGETFLAGS="–fopenmp-targets=nvptx64-nvidia-cuda" 
+# For AMD Rocm (march is MI50: gfx906, MI100: gfx908):
+# OMPTARGETFLAGS="-target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx908"
+# For NVIDIA:
+OMPTARGETFLAGS="-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda"
 
 # Linker flags
-LDFLAGS="-lomp -lomptarget"
+LDFLAGS="-fopenmp"
+
 # Location of various CUDA maths libraries
 LDFLAGS+=" -L${CUDA_MATH_DIR}/lib64"
 

From 5ccef6aaee922d28aa3dc9d9ec558f77fb9c4608 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergi=20Sis=C3=B3?= <sergiesg@gmail.com>
Date: Thu, 14 Apr 2022 09:31:56 +0100
Subject: [PATCH 4/7] Update NemoLite2D algorithm layer

---
 .../manual_versions/psykal_acc/nemolite2d.f90 | 30 +++++++++----------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
index 49428770..d1d004fc 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
@@ -7,9 +7,9 @@ program gocean2d
   use gocean2d_io_mod, only: model_write
   use gocean_mod,      only: model_write_log, gocean_initialise, &
                              gocean_finalise
-  !use likwid
 
-  !> A Horizontal 2D hydrodynamic ocean model which
+  !> GOcean2d is a Horizontal 2D hydrodynamic ocean model initially developed
+  !! by Hedong Liu, UK National Oceanography Centre (NOC), which:
   !!   1) using structured grid
   !!   2) using direct data addressing structures
 
@@ -30,8 +30,9 @@ program gocean2d
   type(r2d_field) :: ua_fld, va_fld
 
   ! time stepping index
-  integer     :: istp  
+  integer     :: istp
   integer     :: itimer0
+  integer     :: warmup_iterations = 1
 
   ! Scratch space for logging messages
   character(len=160) :: log_str
@@ -43,13 +44,11 @@ program gocean2d
   ! points immediately to the North and East of a T point all have the
   ! same i,j index).  This is the same offset scheme as used by NEMO.
   model_grid = grid_type(GO_ARAKAWA_C, &
-  !  BC_PERIODIC, BC_NON_PERIODIC ??
                          (/GO_BC_EXTERNAL,GO_BC_EXTERNAL,GO_BC_NONE/), &
                          GO_OFFSET_NE)
 
   !! read in model parameters and configure the model grid 
   CALL model_init(model_grid)
-  !call likwid_markerInit()
 
   ! Create fields on this grid
 
@@ -92,24 +91,24 @@ program gocean2d
 
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Warm up', &
-                   num_repeats=INT(1,kind=i_def64) )
+      num_repeats=INT(warmup_iterations,kind=i_def64) )
 
-  call step(nit000,                               &
-           ua_fld, va_fld, un_fld, vn_fld,     &
-           sshn_t_fld, sshn_u_fld, sshn_v_fld, &
-           ssha_t_fld, ssha_u_fld, ssha_v_fld, &
-           hu_fld, hv_fld, ht_fld)
+  do istp = nit000, nit000 + warmup_iterations - 1, 1
+      call step(istp,                               &
+                ua_fld, va_fld, un_fld, vn_fld,     &
+                sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+                ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+                hu_fld, hv_fld, ht_fld)
+   enddo
 
   ! Stop the timer for the time-stepping section
   call timer_stop(itimer0)
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
-                   num_repeats=INT(nitend-nit000,kind=i_def64) )
+      num_repeats=INT(nitend-(nit000+warmup_iterations)+1,kind=i_def64))
 
   !! time stepping 
-  do istp = nit000+1, nitend, 1
-
-     !call model_write_log("('istp == ',I6)",istp)
+  do istp = nit000+warmup_iterations, nitend, 1
 
      call step(istp,                               &
                ua_fld, va_fld, un_fld, vn_fld,     &
@@ -133,7 +132,6 @@ program gocean2d
 
   !! finalise the model run
   call model_finalise()
-  !call likwid_markerClose()
 
   call model_write_log("((A))", 'Simulation finished!!')
 

From e1a7e93cf26a6eace1fad2d715d0544f7a5c882c Mon Sep 17 00:00:00 2001
From: Sergi Siso <sergi.siso@stfc.ac.uk>
Date: Thu, 14 Apr 2022 09:51:00 +0100
Subject: [PATCH 5/7] NemoLite2D add more timing sections in the algorithm
 layer

---
 .../manual_versions/psykal_acc/nemolite2d.f90 | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90 b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
index d1d004fc..edc1f9cd 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_acc/nemolite2d.f90
@@ -10,8 +10,8 @@ program gocean2d
 
   !> GOcean2d is a Horizontal 2D hydrodynamic ocean model initially developed
   !! by Hedong Liu, UK National Oceanography Centre (NOC), which:
-  !!   1) using structured grid
-  !!   2) using direct data addressing structures
+  !!   1) uses structured grid
+  !!   2) uses direct data addressing structures
 
   implicit none
 
@@ -50,6 +50,11 @@ program gocean2d
   !! read in model parameters and configure the model grid 
   CALL model_init(model_grid)
 
+  ! Start timer for initialisation section (this must be after model_init
+  ! because dl_timer::timer_init() is called inside it)
+  CALL timer_start(itimer0, label='Initialise', &
+      num_repeats=INT(1,kind=i_def64) )
+
   ! Create fields on this grid
 
   ! Sea-surface height now (current time step)
@@ -89,7 +94,10 @@ program gocean2d
                        model_grid%subdomain%global%ystop
   call model_write_log("((A))", TRIM(log_str))
 
-  ! Start timer for time-stepping section
+  ! Stop the timer for the initialisation section
+  call timer_stop(itimer0)
+
+  ! Start timer for warm-up section
   CALL timer_start(itimer0, label='Warm up', &
       num_repeats=INT(warmup_iterations,kind=i_def64) )
 
@@ -101,7 +109,7 @@ program gocean2d
                 hu_fld, hv_fld, ht_fld)
    enddo
 
-  ! Stop the timer for the time-stepping section
+  ! Stop the timer for the warm-up section
   call timer_stop(itimer0)
   ! Start timer for time-stepping section
   CALL timer_start(itimer0, label='Time-stepping', &
@@ -124,12 +132,19 @@ program gocean2d
   ! Stop the timer for the time-stepping section
   call timer_stop(itimer0)
 
+  ! Start timer for checksum section
+  CALL timer_start(itimer0, label='Checksum reductions', &
+      num_repeats=INT(1,kind=i_def64) )
+
   ! Compute and output some checksums for error checking
   call model_write_log("('ua checksum = ', E16.8)", &
                        field_checksum(ua_fld))
   call model_write_log("('va checksum = ', E16.8)", &
                        field_checksum(va_fld))
 
+  ! Stop the timer for the checksum section
+  call timer_stop(itimer0)
+
   !! finalise the model run
   call model_finalise()
 

From 7a1a65dae22174c726b2f508cd5ce8772438eab7 Mon Sep 17 00:00:00 2001
From: Sergi Siso <sergi.siso@stfc.ac.uk>
Date: Thu, 14 Apr 2022 10:10:21 +0100
Subject: [PATCH 6/7] Fix NemoLite2D kokkos preprocessor macros

---
 .../psykal_kokkos/time_step_views_kokkos.cpp                | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
index 3edb242f..147a1ce4 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/time_step_views_kokkos.cpp
@@ -115,11 +115,9 @@ extern "C" void c_invoke_time_step(
     // The execution space is given as a preprocessor define when compiling
     // this file. e.g. `g++ -DEXEC_SPACE=OpenMP time_step_kokkos.cpp -c`
 #if defined (EXECUTION_SPACE)
-    #if EXECUTION_SPACE == HIP
-    using execution_space = Kokkos::Experimental::EXECUTION_SPACE;
-    #else
     using execution_space = Kokkos::EXECUTION_SPACE;
-    #endif
+    // Replace execution_space with the line below for the HIP backend
+    // using execution_space = Kokkos::Experimental::EXECUTION_SPACE;
 #else
     using execution_space = Kokkos::DefaultExecutionSpace;
 #endif

From 002a6a3afe57787d649146d169cee5a07407bb3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergi=20Sis=C3=B3?= <sergiesg@gmail.com>
Date: Tue, 21 Jun 2022 12:13:46 +0100
Subject: [PATCH 7/7] Update nemolite2d psykal version to use multiple timing
 sections

---
 .../nemo/nemolite2d/psykal/nemolite2d_alg.f90 | 63 ++++++++++++++-----
 1 file changed, 47 insertions(+), 16 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90 b/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90
index c37d18cd..b620561b 100644
--- a/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90
+++ b/benchmarks/nemo/nemolite2d/psykal/nemolite2d_alg.f90
@@ -31,7 +31,10 @@ program gocean2d
   ! time stepping index
   integer :: istp 
   integer :: itimer0
-  integer(i_def64) :: nrepeat
+  integer :: warmup_iterations = 1
+
+  ! Scratch space for logging messages
+  character(len=160) :: log_str
 
   call gocean_initialise()
 
@@ -46,6 +49,11 @@ program gocean2d
   !! read in model parameters and configure the model grid 
   CALL model_init(model_grid)
 
+  ! Start timer for initialisation section (this must be after model_init
+  ! because dl_timer::timer_init() is called inside it)
+  CALL timer_start(itimer0, label='Initialise', &
+      num_repeats=INT(1,kind=i_def64) )
+
   ! Create fields on this grid
 
   ! Sea-surface height now (current time step)
@@ -78,13 +86,37 @@ program gocean2d
 
   call model_write(model_grid, 0, ht_fld, sshn_t_fld, un_fld, vn_fld)
 
+  write(log_str, "('Simulation domain = (',I4,':',I4,',',I4,':',I4,')')") &
+                   model_grid%subdomain%global%xstart, &
+                   model_grid%subdomain%global%xstop,  &
+                   model_grid%subdomain%global%ystart, &
+                   model_grid%subdomain%global%ystop
+  call model_write_log("((A))", TRIM(log_str))
+
+  ! Stop the timer for the initialisation section
+  call timer_stop(itimer0)
+  
+  ! Start timer for warm-up section
+  CALL timer_start(itimer0, label='Warm up', &
+      num_repeats=INT(warmup_iterations,kind=i_def64) )
+
+  do istp = nit000, nit000 + warmup_iterations - 1, 1
+      call step(istp,                               &
+                ua_fld, va_fld, un_fld, vn_fld,     &
+                sshn_t_fld, sshn_u_fld, sshn_v_fld, &
+                ssha_t_fld, ssha_u_fld, ssha_v_fld, &
+                hu_fld, hv_fld, ht_fld)
+  enddo
+
+  ! Stop the timer for the warm-up section
+  call timer_stop(itimer0)
+
   ! Start timer for time-stepping section
-  nrepeat = nitend - nit000 + 1
-  call model_write_log("((A))", '=== Start Time-stepping ===')
-  CALL timer_start(itimer0, label='Time-stepping', num_repeats=nrepeat)
+  CALL timer_start(itimer0, label='Time-stepping', &
+      num_repeats=INT(nitend-(nit000+warmup_iterations)+1,kind=i_def64))
 
   !! time stepping 
-  do istp = nit000, nitend, 1
+  do istp = nit000+warmup_iterations, nitend, 1
 
      call step(istp,                               &
                ua_fld, va_fld, un_fld, vn_fld,     &
@@ -100,23 +132,22 @@ program gocean2d
   ! Stop the timer for the time-stepping section
   call timer_stop(itimer0)
 
-  call model_write_log("((A))", '=== Time-stepping finished ===')
+  ! Start timer for checksum section
+  CALL timer_start(itimer0, label='Checksum reductions', &
+      num_repeats=INT(1,kind=i_def64) )
 
   ! Compute and output some checksums for error checking
-  call model_write_log("('ua checksum = ',E16.8)", field_checksum(ua_fld))
-  call model_write_log("('va checksum = ',E16.8)", field_checksum(va_fld))
-  ! call model_write_log("('ssh_u checksum = ',E16.8)", &
-  !                      field_checksum(sshn_u_fld))
-  ! call model_write_log("('ssh_v checksum = ',E16.8)", &
-  !                      field_checksum(sshn_v_fld))
-  ! call model_write_log("('ssh_t checksum = ',E16.8)", &
-  !                      field_checksum(sshn_t_fld))
+  call model_write_log("('ua checksum = ', E16.8)", &
+                       field_checksum(ua_fld))
+  call model_write_log("('va checksum = ', E16.8)", &
+                       field_checksum(va_fld))
+
+  ! Stop the timer for the checksum section
+  call timer_stop(itimer0)
 
   !! finalise the model run
   call model_finalise()
-  
   call model_write_log("((A))", 'Simulation finished!!')
-
   call gocean_finalise()
 
 end program gocean2d