diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
index 8e40229b..9453719e 100644
--- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
+++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
@@ -45,7 +45,7 @@ tra_adv_no_auto_serial: dl_timer
 # OpenACC version with timer around outer loop only.
 tra_adv_acc: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/kernels_trans.py -opsy \
+	${PSYCLONE} -s ../scripts/acc_kernels_trans.py -opsy \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -55,39 +55,68 @@ tra_adv_acc: dl_timer
 # OpenACC version with nvtx profiling instrumentation.
 tra_adv_acc_prof: dl_timer
 	mkdir -p $@
-	${PSYCLONE} --profile invokes -s ../scripts/kernels_trans.py -opsy \
+	${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_trans.py -opsy \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
 	${MAKE} PROF_LIB_INC="-I${PSYCLONE_NVIDIA_LIB_DIR} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME} ${PSYCLONE_NVIDIA_LIB_DIR}/libnvtx_prof.a" -C $@
 
+# Serial version in SIR compliant form: sources are copied as-is (no PSyclone step).
+tra_adv_no_scalars_serial: dl_timer
+	mkdir -p $@
+	cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90
+	cp tra_adv_compute_no_scalars.F90 $@/tra_adv_compute.F90
+	cp Makefile_gen $@/Makefile
+	${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \
+            LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
+
+# OpenACC version of the SIR-compliant (no-scalars) source, built with ${UMEMFLAGS}.
+tra_adv_no_scalars_acc: dl_timer
+	mkdir -p $@
+	${PSYCLONE} -s ../scripts/acc_kernels_trans.py -opsy \
+            $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90
+	cp Makefile_gen $@/Makefile
+	cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90
+	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \
+           LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \
+           -C $@
+
 # Serial Fortran version after transformation to SIR-compliant form.
-tra_adv_sir: dl_timer
+tra_adv_sir: dl_timer ../scripts/sir_loop_trans.py
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \
-            ./tra_adv_compute_auto_arrays.F90
+	${PSYCLONE} -s ../scripts/sir_loop_trans.py -opsy $@/tra_adv_compute.f90 \
+            ./tra_adv_compute_no_scalars.F90
 	cp Makefile_gen $@/Makefile
-	cp tra_adv_driver.F90 $@/.
+	cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90
 	${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 # OpenACC added after transformation to SIR-compliant form.
-tra_adv_sir_acc: dl_timer
+tra_adv_sir_acc_um: dl_timer ../scripts/sir_loop_kernels_um_trans.py
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \
-            $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
+	${PSYCLONE} -s ../scripts/sir_loop_kernels_um_trans.py -opsy \
+            $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90
 	cp Makefile_gen $@/Makefile
-	cp tra_adv_driver.F90 $@/.
-	${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \
-           LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
+	cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90
+	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -Mfma -I../${DL_TIMER_DIR}/src" \
+           LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" -C $@
+
+tra_adv_sir_acc: dl_timer ../scripts/sir_loop_kernels_trans.py
+	mkdir -p $@
+	${PSYCLONE} -s ../scripts/sir_loop_kernels_trans.py -opsy \
+            $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90
+	cp Makefile_gen $@/Makefile
+	cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90
+	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \
+           LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" -C $@
 
 tra_adv_sir_acc_prof: dl_timer
 	mkdir -p $@
 	${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -opsy \
-            $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
+            $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90
 	cp Makefile_gen $@/Makefile
-	cp tra_adv_driver.F90 $@/.
+	cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90
 	${MAKE} PROF_LIB_INC="-I${PSYCLONE_NVIDIA_LIB_DIR} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME} ${PSYCLONE_NVIDIA_LIB_DIR}/libnvtx_prof.a" -C $@
 
diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_compute_no_scalars.F90 b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_compute_no_scalars.F90
new file mode 100644
index 00000000..57e2c7c0
--- /dev/null
+++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_compute_no_scalars.F90
@@ -0,0 +1,201 @@
+module tra_adv_compute_mod
+  implicit none
+  ! Tracer-advection compute routine with zdt (and, in the last two loop nests, zbtr) replaced by literal 1.0.
+contains
+  ! Updates mydomain in place; zind, zwx, zwy, zslpx and zslpy are overwritten as work arrays.
+  subroutine tra_adv_compute(zind, tsn, ztfreez, rnfmsk, rnfmsk_z, upsmsk, tmask, zwx, zwy, umask, vmask, &
+                             mydomain, zslpx, zslpy, pun, pvn, pwn, jpi, jpj, jpk, iter)
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout):: zind
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(in)   :: tsn, tmask
+      REAL*8, ALLOCATABLE, DIMENSION(:, :), intent(in) :: ztfreez, rnfmsk, upsmsk
+      REAL*8, ALLOCATABLE, DIMENSION(:), intent(in) :: rnfmsk_z
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout) :: zwx, zwy
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(in) :: umask, vmask
+
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout) :: zslpx, zslpy
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout) :: mydomain
+      REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(in) :: pun, pvn
+
+      REAL*8, ALLOCATABLE, DIMENSION(:,:,:), intent(in) :: pwn
+
+      INTEGER, INTENT(IN) :: jpi, jpj, jpk
+      INTEGER, INTENT(IN) :: iter   ! not referenced in the routine body
+
+      REAL*8 :: zbtr, ztra
+
+      REAL*8 :: z0u, zalpha, zu, zzwx, zzwy, z0v, zv
+
+      REAL*8 :: zice
+
+      REAL*8 :: z0w, zw
+
+      INTEGER :: ji, jj, jk
+      ! zind = 1 - MAX(rnfmsk*rnfmsk_z, upsmsk, zice)*tmask, with zice = 1 where tsn <= ztfreez + 0.1
+      DO jk = 1, jpk
+         DO jj = 1, jpj
+            DO ji = 1, jpi
+               zice = 0.d0
+               IF (tsn(ji, jj, jk) <= ztfreez(ji, jj) + 0.1d0) THEN; zice = 1.d0
+               ELSE; zice = 0.d0
+               END IF
+
+               zind(ji, jj, jk) = MAX( &
+                  rnfmsk(ji, jj)*rnfmsk_z(jk), &
+                  upsmsk(ji, jj), &
+                  zice                               &
+                  &                  )*tmask(ji, jj, jk)
+               zind(ji, jj, jk) = 1 - zind(ji, jj, jk)
+            END DO
+         END DO
+      END DO
+
+      zwx(:, :, jpk) = 0.e0; zwy(:, :, jpk) = 0.e0
+      ! Masked forward differences of mydomain along ji (zwx) and jj (zwy).
+      DO jk = 1, jpk - 1
+         DO jj = 1, jpj - 1
+            DO ji = 1, jpi - 1
+               zwx(ji, jj, jk) = umask(ji, jj, jk)*(mydomain(ji + 1, jj, jk) - mydomain(ji, jj, jk))
+               zwy(ji, jj, jk) = vmask(ji, jj, jk)*(mydomain(ji, jj + 1, jk) - mydomain(ji, jj, jk))
+            END DO
+         END DO
+      END DO
+
+      zslpx(:, :, jpk) = 0.e0; zslpy(:, :, jpk) = 0.e0
+      ! Slopes: mean of the two adjacent differences, or zero when they have opposite signs.
+      DO jk = 1, jpk - 1
+         DO jj = 2, jpj
+            DO ji = 2, jpi
+               zslpx(ji, jj, jk) = (zwx(ji, jj, jk) + zwx(ji - 1, jj, jk))   &
+                                  &            *(0.25d0 + SIGN(0.25d0, zwx(ji, jj, jk)*zwx(ji - 1, jj, jk)))
+               zslpy(ji, jj, jk) = (zwy(ji, jj, jk) + zwy(ji, jj - 1, jk))   &
+                                  &            *(0.25d0 + SIGN(0.25d0, zwy(ji, jj, jk)*zwy(ji, jj - 1, jk)))
+            END DO
+         END DO
+      END DO
+      ! Limit each slope magnitude to twice the smaller adjacent difference.
+      DO jk = 1, jpk - 1
+         DO jj = 2, jpj
+            DO ji = 2, jpi
+               zslpx(ji, jj, jk) = SIGN(1.d0, zslpx(ji, jj, jk))*MIN(ABS(zslpx(ji, jj, jk)),   &
+                  &                                                2.d0*ABS(zwx(ji - 1, jj, jk)),   &
+                  &                                                2.d0*ABS(zwx(ji, jj, jk)))
+               zslpy(ji, jj, jk) = SIGN(1.d0, zslpy(ji, jj, jk))*MIN(ABS(zslpy(ji, jj, jk)),   &
+                  &                                                2.d0*ABS(zwy(ji, jj - 1, jk)),   &
+                  &                                                2.d0*ABS(zwy(ji, jj, jk)))
+            END DO
+         END DO
+      END DO
+
+      DO jk = 1, jpk - 1
+         !zdt = 1
+         DO jj = 2, jpj - 1
+            DO ji = 2, jpi - 1
+               z0u = SIGN(0.5d0, pun(ji, jj, jk))
+               zalpha = 0.5d0 - z0u
+               !zu = z0u - 0.5d0*pun(ji, jj, jk)*zdt
+               zu = z0u - 0.5d0*pun(ji, jj, jk)*1.0
+
+               zzwx = mydomain(ji + 1, jj, jk) + zind(ji, jj, jk)*(zu*zslpx(ji + 1, jj, jk))
+               zzwy = mydomain(ji, jj, jk) + zind(ji, jj, jk)*(zu*zslpx(ji, jj, jk))
+
+               zwx(ji, jj, jk) = pun(ji, jj, jk)*(zalpha*zzwx + (1.-zalpha)*zzwy)
+
+               z0v = SIGN(0.5d0, pvn(ji, jj, jk))
+               zalpha = 0.5d0 - z0v
+               !zv = z0v - 0.5d0*pvn(ji, jj, jk)*zdt
+               zv = z0v - 0.5d0*pvn(ji, jj, jk)*1.0
+
+               zzwx = mydomain(ji, jj + 1, jk) + zind(ji, jj, jk)*(zv*zslpy(ji, jj + 1, jk))
+               zzwy = mydomain(ji, jj, jk) + zind(ji, jj, jk)*(zv*zslpy(ji, jj, jk))
+
+               zwy(ji, jj, jk) = pvn(ji, jj, jk)*(zalpha*zzwx + (1.d0 - zalpha)*zzwy)
+            END DO
+         END DO
+      END DO
+
+      DO jk = 1, jpk - 1
+         DO jj = 2, jpj - 1
+            DO ji = 2, jpi - 1
+               zbtr = 1.
+               ztra = -zbtr*(zwx(ji, jj, jk) - zwx(ji - 1, jj, jk)   &
+                  &               + zwy(ji, jj, jk) - zwy(ji, jj - 1, jk))
+               mydomain(ji, jj, jk) = mydomain(ji, jj, jk) + ztra
+            END DO
+         END DO
+      END DO
+
+      zwx(:, :, 1) = 0.e0; zwx(:, :, jpk) = 0.e0
+
+      DO jk = 2, jpk - 1
+         zwx(:, :, jk) = tmask(:, :, jk)*(mydomain(:, :, jk - 1) - mydomain(:, :, jk))
+      END DO
+
+      zslpx(:, :, 1) = 0.e0
+
+      DO jk = 2, jpk - 1
+         DO jj = 1, jpj
+            DO ji = 1, jpi
+               zslpx(ji, jj, jk) = (zwx(ji, jj, jk) + zwx(ji, jj, jk + 1))   &
+                                  &            *(0.25d0 + SIGN(0.25d0, zwx(ji, jj, jk)*zwx(ji, jj, jk + 1)))
+            END DO
+         END DO
+      END DO
+
+      DO jk = 2, jpk - 1
+         DO jj = 1, jpj
+            DO ji = 1, jpi
+               zslpx(ji, jj, jk) = SIGN(1.d0, zslpx(ji, jj, jk))*MIN(ABS(zslpx(ji, jj, jk)), &
+                  &                                               2.d0*ABS(zwx(ji, jj, jk + 1)),   &
+                  &                                               2.d0*ABS(zwx(ji, jj, jk)))
+            END DO
+         END DO
+      END DO
+
+      zwx(:, :, 1) = pwn(:, :, 1)*mydomain(:, :, 1)
+
+      !zdt = 1
+      !zbtr = 1.
+      !DO jk = 1, jpk-1
+      !   DO jj = 2, jpj-1
+      !      DO ji = 2, jpi-1
+      !         z0w = SIGN( 0.5d0, pwn(ji,jj,jk+1) )
+      !         zalpha = 0.5d0 + z0w
+      !         zw  = z0w - 0.5d0 * pwn(ji,jj,jk+1) * zdt * zbtr
+      !
+      !         zzwx = mydomain(ji,jj,jk+1) + zind(ji,jj,jk) * (zw * zslpx(ji,jj,jk+1))
+      !         zzwy = mydomain(ji,jj,jk  ) + zind(ji,jj,jk) * (zw * zslpx(ji,jj,jk  ))
+      !
+      !         zwx(ji,jj,jk+1) = pwn(ji,jj,jk+1) * ( zalpha * zzwx + (1.-zalpha) * zzwy )
+      !      END DO
+      !   END DO
+      !END DO
+      DO jk = 2, jpk
+         DO jj = 2, jpj - 1
+            DO ji = 2, jpi - 1
+               z0w = SIGN(0.5d0, pwn(ji, jj, jk))
+               zalpha = 0.5d0 + z0w
+               !zw = z0w - 0.5d0*pwn(ji, jj, jk)*zdt*zbtr
+               zw = z0w - 0.5d0*pwn(ji, jj, jk)*1.0*1.0
+
+               zzwx = mydomain(ji, jj, jk) + zind(ji, jj, jk - 1)*(zw*zslpx(ji, jj, jk))
+               zzwy = mydomain(ji, jj, jk - 1) + zind(ji, jj, jk - 1)*(zw*zslpx(ji, jj, jk - 1))
+
+               zwx(ji, jj, jk) = pwn(ji, jj, jk)*(zalpha*zzwx + (1.-zalpha)*zzwy)
+            END DO
+         END DO
+      END DO
+
+      !zbtr = 1.
+      DO jk = 1, jpk - 1
+         DO jj = 2, jpj - 1
+            DO ji = 2, jpi - 1
+               !ztra = -zbtr*(zwx(ji, jj, jk) - zwx(ji, jj, jk + 1))
+               ztra = -1.0*(zwx(ji, jj, jk) - zwx(ji, jj, jk + 1))
+               mydomain(ji, jj, jk) = ztra
+            END DO
+         END DO
+      END DO
+
+    end subroutine tra_adv_compute
+
+end module tra_adv_compute_mod
diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_driver_no_scalars.F90 b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_driver_no_scalars.F90
new file mode 100644
index 00000000..919ad031
--- /dev/null
+++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_driver_no_scalars.F90
@@ -0,0 +1,150 @@
+program tracer_advection
+  USE dl_timer, only: timer_init, timer_register, timer_start, timer_stop, timer_report
+  use tra_adv_compute_mod, only: tra_adv_compute
+  implicit none
+  REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:)   :: tsn
+  REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:)   :: pun, pvn, pwn
+  REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:)   :: mydomain, umask, vmask, tmask, zind
+  REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:)   :: zslpx, zslpy, zwx, zwy
+  REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:)     :: ztfreez, rnfmsk, upsmsk
+  REAL*8, ALLOCATABLE, SAVE, DIMENSION(:)       :: rnfmsk_z
+  REAL*8                                        :: r, checksum
+  INTEGER                                       :: jpi, jpj, jpk, ji, jj, jk, jt
+  INTEGER*8                                     :: itn_count
+  CHARACTER(len=10)                             :: env
+  !> Timer indexes, one for initialisation, one for the 'time-stepping'
+  INTEGER :: init_timer, step_timer
+  ! Array extents (JPI, JPJ, JPK) and the iteration count (IT) are read from the environment.
+  CALL get_environment_variable("JPI", env)
+  READ ( env, '(i10)' ) jpi
+  CALL get_environment_variable("JPJ", env)
+  READ ( env, '(i10)' ) jpj
+  CALL get_environment_variable("JPK", env)
+  READ ( env, '(i10)' ) jpk
+  CALL get_environment_variable("IT", env)
+  READ ( env, '(i10)' ) itn_count
+
+  ! Set-up our timers
+
+  CALL timer_init()
+  CALL timer_register(init_timer, label='Initialisation')
+  ! We exclude the first step from the timed region.
+  CALL timer_register(step_timer, label='Time-stepping', &
+                      num_repeats=itn_count-1)
+
+  ! Initialisation
+
+  call timer_start(init_timer)
+
+  ALLOCATE( mydomain (jpi,jpj,jpk), &
+            pun (jpi,jpj,jpk), &
+            pvn (jpi,jpj,jpk), &
+            pwn (jpi,jpj,jpk), &
+            umask (jpi,jpj,jpk), &
+            vmask (jpi,jpj,jpk), &
+            tmask (jpi,jpj,jpk), &
+            zind (jpi,jpj,jpk), &
+            ztfreez (jpi,jpj), &
+            rnfmsk (jpi,jpj), &
+            upsmsk (jpi,jpj), &
+            rnfmsk_z (jpk), &
+            tsn(jpi,jpj,jpk), &
+            zslpx(jpi,jpj,jpk), &
+            zslpy(jpi,jpj,jpk), &
+            zwx(jpi,jpj,jpk), &
+            zwy(jpi,jpj,jpk))
+
+  ! Array initialization
+
+  r = jpi*jpj*jpk
+
+  ! the following three lines can be uncommented to randomize arrays initialization
+  !call random_seed()
+  !call random_number(r)
+  !r = r*jpi*jpj*jpk
+
+  DO jk = 1, jpk
+     DO jj = 1, jpj
+        DO ji = 1, jpi
+           umask(ji,jj,jk) = ji*jj*jk/r
+           mydomain(ji,jj,jk) =ji*jj*jk/r
+           pun(ji,jj,jk) =ji*jj*jk/r
+           pvn(ji,jj,jk) =ji*jj*jk/r
+           pwn(ji,jj,jk) =ji*jj*jk/r
+           vmask(ji,jj,jk)= ji*jj*jk/r
+           tsn(ji,jj,jk)= ji*jj*jk/r
+           tmask(ji,jj,jk)= ji*jj*jk/r
+        END DO
+     END DO
+  END DO
+
+  r = jpi*jpj
+  DO jj=1, jpj
+     DO ji=1, jpi
+        ztfreez(ji,jj) = ji*jj/r
+        upsmsk(ji,jj) = ji*jj/r
+        rnfmsk(ji,jj) = ji*jj/r
+     END DO
+  END DO
+
+  DO jk=1, jpk
+     rnfmsk_z(jk)=jk/jpk  ! NOTE(review): integer division, so 0 for jk < jpk and 1 at jk == jpk -- confirm intended
+  END DO
+
+  call timer_stop(init_timer)
+  ! The first step (jt = 1) runs before step_timer is started, so it is not timed.
+  jt = 1
+  call tra_adv_compute(zind, tsn, ztfreez, rnfmsk, rnfmsk_z, upsmsk, tmask, &
+          zwx, zwy, umask, vmask, mydomain, zslpx, zslpy, pun, pvn, pwn, &
+          jpi, jpj, jpk, jt)
+
+  call timer_start(step_timer)
+
+  do jt = 2, itn_count
+      call tra_adv_compute(zind, tsn, ztfreez, rnfmsk, rnfmsk_z, upsmsk, &
+              tmask, zwx, zwy, umask, vmask, mydomain, zslpx, zslpy, &
+              pun, pvn, pwn, jpi, jpj, jpk, jt)
+  end do
+
+  call timer_stop(step_timer)
+
+  ! Output final field and compute checksum
+
+  open(unit = 24, file = 'output.dat', form='formatted')
+
+  checksum = 0.0d0
+  do jk = 1, jpk-1
+     do jj = 2, jpj-1
+        do ji = 2, jpi-1
+           checksum = checksum + mydomain(ji,jj,jk)
+           write(24,*) mydomain(ji,jj,jk)
+        end do
+     end do
+  end do
+
+  write(*, "('Checksum for domain ', 2(I4, ' x'), I4, ' (',I4,' iterations) = ',E23.16)") &
+       jpi, jpj, jpk, itn_count, checksum
+
+  close(24)
+
+  deallocate( mydomain )
+  deallocate( pun )
+  deallocate( pvn )
+  deallocate( pwn )
+  deallocate( umask)
+  deallocate( vmask)
+  deallocate( tmask)
+  deallocate( zind )
+  deallocate( ztfreez )
+  deallocate( rnfmsk)
+  deallocate( upsmsk)
+  deallocate( rnfmsk_z)
+  deallocate( tsn)
+  deallocate( zslpx)
+  deallocate( zslpy)
+  deallocate( zwx)
+  deallocate( zwy)
+
+  call timer_report()
+
+end program tracer_advection
diff --git a/benchmarks/nemo/tracer_advection/original/Makefile b/benchmarks/nemo/tracer_advection/original/Makefile
index 7ef7fe85..811ed84e 100644
--- a/benchmarks/nemo/tracer_advection/original/Makefile
+++ b/benchmarks/nemo/tracer_advection/original/Makefile
@@ -59,13 +59,20 @@ tra_adv_serial: dl_timer ./tra_adv.F90
 	${MAKE} FORT_FLAGS="${F90FLAGS} -I../${DL_TIMER_DIR}/src" \
             LDFLAGS="${LDFLAGS} ${OMPFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
-tra_adv_acc_kernels: dl_timer ./tra_adv.F90
+tra_adv_acc_kernels_umem: dl_timer ./tra_adv.F90
 	mkdir -p $@
 	${PSYCLONE} -s ../scripts/acc_kernels_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
+tra_adv_acc_kernels: dl_timer ./tra_adv.F90
+	mkdir -p $@
+	${PSYCLONE} -s ../scripts/acc_kernels_explicit_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	cp Makefile_gen $@/Makefile
+	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \
+           LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
+
 tra_adv_acc_loops: dl_timer ./tra_adv.F90
 	mkdir -p $@
 	${PSYCLONE} -s ../scripts/acc_loops_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_trans.py
new file mode 100644
index 00000000..9844c81a
--- /dev/null
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_trans.py
@@ -0,0 +1,500 @@
+# -----------------------------------------------------------------------------
+# BSD 3-Clause License
+#
+# Copyright (c) 2018-2022, Science and Technology Facilities Council.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+# Authors: R. W. Ford, A. R. Porter, N. Nobre and S. Siso, STFC Daresbury Lab
+
+'''A transformation script that seeks to apply OpenACC DATA and KERNELS
+directives to NEMO style code. In order to use it you must first install
+PSyclone. See README.md in the top-level directory.
+
+Once you have psyclone installed, this may be used by doing:
+
+ $ psyclone -api nemo -s acc_kernels_explicit_trans.py some_source_file.f90
+
+This should produce a lot of output, ending with generated
+Fortran. Note that the Fortran source files provided to PSyclone must
+have already been preprocessed (if required).
+
+The transformation script attempts to insert Kernels directives at the
+highest possible location(s) in the schedule tree (i.e. to enclose as
+much code as possible in each Kernels region). However, due to
+limitations in the Nvidia compiler, we must take care to exclude certain
+nodes (such as If blocks) from within Kernel regions. If a proposed
+region is found to contain such a node (by the ``valid_acc_kernel``
+routine) then the script moves a level down the tree and then repeats
+the process of attempting to create the largest possible Kernel region.
+
+Tested with the NVIDIA HPC SDK version 22.5.
+'''
+
+import logging
+from psyclone.errors import InternalError
+from psyclone.nemo import NemoInvokeSchedule, NemoKern, NemoLoop
+from psyclone.psyGen import TransInfo
+from psyclone.psyir.nodes import IfBlock, CodeBlock, Schedule, \
+    ArrayReference, Assignment, BinaryOperation, Loop, \
+    Literal, Return, Call, ACCDirective, ACCLoopDirective
+from psyclone.psyir.transformations import TransformationError, ProfileTrans, \
+                                           ACCUpdateTrans
+from psyclone.transformations import ACCEnterDataTrans
+
+# Get the PSyclone transformations we will use
+ACC_KERN_TRANS = TransInfo().get_trans_name('ACCKernelsTrans')
+ACC_LOOP_TRANS = TransInfo().get_trans_name('ACCLoopTrans')
+ACC_ROUTINE_TRANS = TransInfo().get_trans_name('ACCRoutineTrans')
+ACC_EDATA_TRANS = ACCEnterDataTrans()
+ACC_UPDATE_TRANS = ACCUpdateTrans()
+PROFILE_TRANS = ProfileTrans()
+
+# Whether or not to add profiling calls around unaccelerated regions
+PROFILE_NONACC = False
+
+# Whether or not to add OpenACC enter data and update directives to explicitly
+# move data between host and device memory
+ACC_DATA = True
+
+# If routine names contain these substrings then we do not profile them
+PROFILING_IGNORE = ["_init", "_rst", "alloc", "agrif", "flo_dom",
+                    "macho", "mpp_", "nemo_gcm",
+                    # These are small functions that the addition of profiling
+                    # prevents from being in-lined (and then breaks any attempt
+                    # to create OpenACC regions with calls to them)
+                    "interp1", "interp2", "interp3", "integ_spline", "sbc_dcy",
+                    "sum", "sign_"]
+
+# Routines we do not attempt to add any OpenACC to (because it breaks with
+# the Nvidia compiler or because it just isn't worth it)
+ACC_IGNORE = ["day_mth",  # Just calendar operations
+              "obs_surf_alloc", "oce_alloc",
+              # Compiler fails w/ "Unsupported local variable"
+              # Zero performance impact since outside execution path
+              "copy_obfbdata", "merge_obfbdata",
+              "turb_ncar",  # Transforming hurts performance
+              "iom_open", "iom_get_123d", "iom_nf90_rp0123d",
+              "trc_bc_ini", "p2z_ini", "p4z_ini"]  # Str handling, init routine
+
+# Currently fparser has no way of distinguishing array accesses from
+# function calls if the symbol is imported from some other module.
+# We therefore work-around this by keeping a list of known NEMO
+# functions that must be excluded from within KERNELS regions.
+NEMO_FUNCTIONS = ["alpha_charn", "cd_neutral_10m", "cpl_freq", "cp_air",
+                  "eos_pt_from_ct", "gamma_moist", "l_vap",
+                  "sbc_dcy", "solfrac", "psi_h", "psi_m", "psi_m_coare",
+                  "psi_h_coare", "psi_m_ecmwf", "psi_h_ecmwf", "q_sat",
+                  "rho_air", "visc_air", "glob_sum",
+                  "glob_sum_full", "ptr_sj", "ptr_sjk", "interp1", "interp2",
+                  "interp3", "integ_spline"]
+
+
+class ExcludeSettings():
+    '''
+    Class to hold settings on what to exclude from OpenACC KERNELS regions.
+
+    :param Optional[dict] settings: map of settings to override.
+
+    '''
+    def __init__(self, settings=None):
+        # Whether we exclude IFs where the logical expression is not a
+        # comparison operation. None (the default) means no overrides.
+        self.ifs_scalars = (settings or {}).get("ifs_scalars", False)
+
+
+# Routines which are exceptions to the OpenACC Kernels regions exclusion rules.
+EXCLUDING = {"default": ExcludeSettings(),
+             # Exclude for better GPU performance (requires further analysis).
+             "dyn_spg_ts": ExcludeSettings({"ifs_scalars": True}),
+             "tra_zdf_imp": ExcludeSettings({"ifs_scalars": True}),
+             # Exclude due to compiler bug preventing CPU multicore executions.
+             "dom_vvl_init": ExcludeSettings({"ifs_scalars": True})}
+
+
+def log_msg(name, msg, node):
+    '''
+    Emit an INFO-level log record explaining why a transformation was not
+    applied, including where the offending node sits in the PSyIR tree.
+
+    :param str name: the name of the routine.
+    :param str msg: the message to log.
+    :param node: the PSyIR node that prevented the transformation.
+    :type node: :py:class:`psyclone.psyir.nodes.Node`
+
+    '''
+    # Climb from the node towards the root, prepending each ancestor so
+    # that the resulting path reads root-first.
+    path = []
+    current = node
+    while current:
+        path.insert(0, current.node_str(colour=False))
+        current = current.parent
+    # Same "routine: reason: location" layout for every message.
+    location = "->".join(path)
+    logging.info("%s: %s: %s", name, msg, location)
+
+
+def valid_acc_kernel(node):
+    '''
+    Whether the sub-tree that has `node` at its root is eligible to be
+    enclosed within an OpenACC KERNELS directive.
+
+    :param node: the node in the PSyIR to check.
+    :type node: :py:class:`psyclone.psyir.nodes.Node`
+
+    :returns: True if the sub-tree can be enclosed in a KERNELS region.
+    :rtype: bool
+
+    '''
+    # The Fortran routine which our parent Invoke represents
+    routine_name = node.ancestor(NemoInvokeSchedule).invoke.name
+
+    # Allow for per-routine setting of what to exclude from within KERNELS
+    # regions. This is because sometimes things work in one context but not
+    # in another (with the Nvidia compiler).
+    excluding = EXCLUDING.get(routine_name, EXCLUDING["default"])
+
+    # Rather than walk the tree multiple times, look for both excluded node
+    # types and possibly problematic operations
+    excluded_node_types = (CodeBlock, Return, Call, IfBlock, NemoLoop)
+    excluded_nodes = node.walk(excluded_node_types)
+
+    for enode in excluded_nodes:
+        if isinstance(enode, (CodeBlock, Return, Call)):
+            log_msg(routine_name,
+                    f"region contains {type(enode).__name__}", enode)
+            return False
+
+        if isinstance(enode, IfBlock):
+            # We permit IF blocks originating from WHERE constructs and
+            # single-statement IF blocks containing a Loop in KERNELS regions
+            if "was_where" in enode.annotations or \
+               ("was_single_stmt" in enode.annotations and enode.walk(Loop)):
+                continue
+
+            arrays = enode.condition.walk(ArrayReference)
+            # We exclude if statements where the condition expression does
+            # not refer to arrays at all as this may cause compiler issues
+            # (get "Missing branch target block") or produce faster code.
+            if not arrays and excluding.ifs_scalars and \
+               not isinstance(enode.condition, BinaryOperation):
+                log_msg(routine_name, "IF references scalars", enode)
+                return False
+            # When using CUDA Unified Memory, only allocated arrays reside in
+            # shared memory (including those that are created by compiler-
+            # -generated allocs, e.g. for automatic arrays). We assume that all
+            # arrays of rank 2 or greater are dynamically allocated, whereas 1D
+            # arrays are often static in NEMO. Hence, we disallow IFs where the
+            # logical expression involves the latter.
+            if any(len(array.children) == 1 for array in arrays):
+                log_msg(routine_name,
+                        "IF references 1D arrays that may be static", enode)
+                return False
+
+        elif isinstance(enode, NemoLoop):
+            # Heuristic:
+            # We don't want to put loops around 3D loops into KERNELS regions
+            # and nor do we want to put loops over levels into KERNELS regions
+            # if they themselves contain several 2D loops.
+            # In general, this heuristic will depend upon how many levels the
+            # model configuration will contain.
+            child = enode.loop_body[0] if enode.loop_body.children else None
+            if isinstance(child, Loop) and child.loop_type == "levels":
+                # We have a loop around a loop over levels
+                log_msg(routine_name, "Loop is around a loop over levels",
+                        enode)
+                return False
+            if enode.loop_type == "levels" and \
+               len(enode.loop_body.children) > 1:
+                # The body of the loop contains more than one statement.
+                # How many distinct loop nests are there?
+                loop_count = 0
+                for child in enode.loop_body.children:
+                    if child.walk(Loop):
+                        loop_count += 1
+                        if loop_count > 1:
+                            log_msg(routine_name,
+                                    "Loop over levels contains several "
+                                    "other loops", enode)
+                            return False
+
+    # For now we don't support putting *just* the implicit loop assignment in
+    # things like:
+    #    if(do_this)my_array(:,:) = 1.0
+    # inside a kernels region. Once we generate Fortran instead of modifying
+    # the fparser2 parse tree this will become possible.
+    if isinstance(node.parent, Schedule) and \
+       isinstance(node.parent.parent, IfBlock) and \
+       "was_single_stmt" in node.parent.parent.annotations:
+        log_msg(routine_name, "Would split single-line If statement", node)
+        return False
+
+    # Finally, check that we haven't got any 'array accesses' that are in
+    # fact function calls.
+    refs = node.walk(ArrayReference)
+    # Since kernels are leaves in the PSyIR, we need to separately check
+    # their schedules for array references too.
+    kernels = node.walk(NemoKern)
+    for kern in kernels:
+        sched = kern.get_kernel_schedule()
+        refs += sched.walk(ArrayReference)
+    for ref in refs:
+        # Check if this reference has the name of a known function and if that
+        # reference appears outside said known function.
+        if ref.name.lower() in NEMO_FUNCTIONS and \
+           ref.name.lower() != routine_name.lower():
+            log_msg(routine_name,
+                    f"Loop contains function call: {ref.name}", ref)
+            return False
+    return True
+
+
+def add_kernels(children):
+    '''
+    Recursively descends the PSyIR, enclosing runs of suitable sibling
+    nodes within OpenACC KERNELS regions at as high a level as possible.
+
+    :param children: list of sibling Nodes in PSyIR that are candidates for \
+                     inclusion in an ACC KERNELS region.
+    :type children: list of :py:class:`psyclone.psyir.nodes.Node`
+
+    :returns: True if at least one KERNELS region was successfully added.
+    :rtype: bool
+
+    '''
+    if not children:
+        return False
+
+    any_added = False
+    candidates = []
+    for node in children[:]:
+        if valid_acc_kernel(node):
+            # This node can join the region that is currently being built.
+            candidates.append(node)
+            continue
+        # This node cannot be put inside a KERNELS region and therefore
+        # marks the end of the current candidate region. Enclose whatever
+        # has been collected so far and then start afresh.
+        if try_kernels_trans(candidates):
+            any_added = True
+        candidates = []
+        # Go down a level and repeat the search inside the rejected node.
+        if isinstance(node, IfBlock):
+            in_if = add_kernels(node.if_body)
+            in_else = add_kernels(node.else_body)
+            found = in_if or in_else
+        elif isinstance(node, Loop):
+            found = add_kernels(node.loop_body)
+        else:
+            found = add_kernels(node.children)
+        any_added = any_added or found
+    # Enclose any nodes left over once all of the siblings have been visited.
+    if try_kernels_trans(candidates):
+        any_added = True
+
+    return any_added
+
+
+def add_profiling(children):
+    '''
+    Descends the PSyIR, wrapping the largest possible runs of sibling nodes
+    in profiling regions. Any node that contains an OpenACC directive or a
+    Return statement is left outside of those regions.
+
+    :param children: sibling nodes in the PSyIR to which to attempt to add \
+                     profiling regions.
+    :type children: list of :py:class:`psyclone.psyir.nodes.Node`
+
+    '''
+    if not children:
+        return
+
+    pending = []
+    for node in children[:]:
+        if not node.walk((ACCDirective, Return)):
+            # Nothing to avoid in here so extend the current candidate region.
+            pending.append(node)
+            continue
+        # walk() found an OpenACC directive or a Return for this node so it
+        # ends the current candidate region. Put what has been gathered so
+        # far inside a profiling region and start a new, empty list.
+        add_profile_region(pending)
+        pending = []
+        # Go down a level and try again, taking care never to put profiling
+        # beneath an OpenACC directive or inside an Assignment.
+        if isinstance(node, IfBlock):
+            add_profiling(node.if_body)
+            add_profiling(node.else_body)
+        elif not isinstance(node, (Assignment, ACCDirective)):
+            add_profiling(node.children)
+    # Profile whatever remains once every sibling has been examined.
+    add_profile_region(pending)
+
+
+def add_profile_region(nodes):
+    '''
+    Tries to enclose the supplied list of nodes within a profiling region.
+    Any TransformationError raised while doing so is silently discarded.
+
+    :param nodes: list of sibling PSyIR nodes to enclose.
+    :type nodes: list of :py:class:`psyclone.psyir.nodes.Node`
+
+    '''
+    if not nodes:
+        return
+    first = nodes[0]
+    # Has profiling been disabled for the routine containing these nodes?
+    routine_name = first.ancestor(NemoInvokeSchedule).invoke.name.lower()
+    if any(ignore in routine_name for ignore in PROFILING_IGNORE):
+        return
+    if len(nodes) == 1:
+        # Neither a CodeBlock consisting of a single statement nor a single
+        # 'IF(condition) CALL blah()' statement gets its own region.
+        lone_codeblock = isinstance(first, CodeBlock) and \
+            len(first.get_ast_nodes) == 1
+        lone_if_call = isinstance(first, IfBlock) and \
+            "was_single_stmt" in first.annotations and \
+            isinstance(first.if_body[0], CodeBlock)
+        if lone_codeblock or lone_if_call:
+            return
+    try:
+        PROFILE_TRANS.apply(nodes)
+    except TransformationError:
+        pass
+
+
+def try_kernels_trans(nodes):
+    '''
+    Attempt to enclose the supplied list of nodes within a kernels
+    region. If the transformation fails then the error message is
+    reported but execution continues. A list of nodes that contains no
+    loop (explicit or implicit) is left untouched.
+
+    :param nodes: list of Nodes to enclose within a Kernels region.
+    :type nodes: list of :py:class:`psyclone.psyir.nodes.Node`
+
+    :returns: True if the transformation was successful, False otherwise.
+    :rtype: bool
+
+    '''
+    # We only enclose the proposed region if it contains a loop, either
+    # explicit or implicit (an assignment to an array range). Using any()
+    # stops the search as soon as the first such node is found.
+    have_loop = any(
+        node.walk(Loop) or
+        any(assign.is_array_range for assign in node.walk(Assignment))
+        for node in nodes)
+    if not have_loop:
+        return False
+
+    try:
+        ACC_KERN_TRANS.apply(nodes, {"default_present": False})
+
+        # Put COLLAPSE on any tightly-nested loops over latitude and longitude.
+        for node in nodes:
+            for loop in node.walk(Loop):
+                if loop.ancestor(ACCLoopDirective):
+                    # We've already transformed a parent Loop so skip this one.
+                    continue
+                # We put a COLLAPSE(2) clause on any perfectly-nested lat-lon
+                # loops that have a Literal value for their step. The latter
+                # condition is necessary to avoid compiler errors.
+                if loop.loop_type != "lat" or \
+                   not isinstance(loop.step_expr, Literal):
+                    continue
+                # The size of the body must be checked *before* indexing into
+                # it as an empty loop body would otherwise raise IndexError.
+                if len(loop.loop_body.children) != 1:
+                    continue
+                inner = loop.loop_body[0]
+                if isinstance(inner, Loop) and inner.loop_type == "lon" and \
+                   isinstance(inner.step_expr, Literal):
+                    try:
+                        ACC_LOOP_TRANS.apply(loop, {"collapse": 2})
+                    except (TransformationError) as err:
+                        print(f"Failed to collapse lat-lon loop: {loop}")
+                        print(f"Error was: {err}")
+
+        return True
+    except (TransformationError, InternalError) as err:
+        print(f"Failed to insert acc kernels around nodes: {nodes}")
+        print(f"Error was: {err}")
+        return False
+
+
+def trans(psy):
+    '''Entry point called by PSyclone. Adds OpenACC 'kernels' directives
+    to NEMO code. Data movement can be handled manually (ACC_DATA) or
+    through CUDA's managed-memory functionality.
+
+    :param psy: The PSy layer object to apply transformations to.
+    :type psy: :py:class:`psyclone.psyGen.PSy`
+
+    :returns: the supplied PSy object after transformation.
+    :rtype: :py:class:`psyclone.psyGen.PSy`
+
+    '''
+    logging.basicConfig(filename='psyclone.log', filemode='w',
+                        level=logging.INFO)
+
+    names = "\n".join(str(name) for name in psy.invokes.names)
+    print(f"Invokes found:\n{names}\n")
+
+    for invoke in psy.invokes.invoke_list:
+        sched = invoke.schedule
+        if not sched:
+            print(f"Invoke {invoke.name} has no Schedule! Skipping...")
+            continue
+
+        # lib_fortran routines with neither a Loop nor a Call get ACC ROUTINE
+        if psy.name == "psy_lib_fortran_psy" and not sched.walk((Loop, Call)):
+            print(f"Transforming {invoke.name} with acc routine")
+            ACC_ROUTINE_TRANS.apply(sched)
+            continue
+
+        # Add OpenACC directives unless this routine is to be ignored
+        if invoke.name.lower() in ACC_IGNORE:
+            print(f"Addition of OpenACC to routine {invoke.name} disabled!")
+        else:
+            print(f"Transforming {invoke.name} with acc kernels")
+            if add_kernels(sched.children) and ACC_DATA:
+                print(f"Transforming {invoke.name} with acc enter data")
+                ACC_EDATA_TRANS.apply(sched)
+
+        if ACC_DATA:
+            print(f"Transforming {invoke.name} with acc update")
+            ACC_UPDATE_TRANS.apply(sched)
+
+        # Add profiling instrumentation
+        if PROFILE_NONACC:
+            print(f"Adding profiling to non-OpenACC regions in {invoke.name}")
+            add_profiling(sched.children)
+
+    return psy
diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
index c1513957..b506880e 100755
--- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
@@ -36,8 +36,7 @@
 ''' PSyclone transformation script to insert OpenMP Target Loop directives
 to the outermost loop that is parallelisable, including implicit loops. '''
 
-from psyclone.psyir.transformations import OMPTargetTrans
-from psyclone.transformations import OMPLoopTrans
+from psyclone.psyir.transformations import OMPTargetTrans, OMPLoopTrans
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
@@ -54,9 +53,9 @@ def trans(psy):
     '''
     omp_target_trans = OMPTargetTrans()
     omp_loop_trans = OMPLoopTrans()
-    # Disabling worksharing will produce the 'loop' directive which is better
+    # Use the 'loop' directive which is better
     # suited to map the work into the GPU
-    omp_loop_trans.omp_worksharing = False
+    omp_loop_trans.omp_directive = "loop"
 
     print("Invokes found:")
     for invoke in psy.invokes.invoke_list:
diff --git a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh
index 09441c3e..85ca589f 100755
--- a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh
+++ b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh
@@ -16,15 +16,26 @@ export IT=500
 # Number of vertical levels
 export JPK=75
 
+export JPI=128
+export JPJ=128
+taskset -c 2 $@ > /dev/null 2>&1
+
 base=2
 #for power in $(seq 4 12); do
-for power in $(seq 4 9); do
+for power in $(seq 6 10); do
 
     size=$(echo "$base^$power" | bc)
     export JPI=${size}
     export JPJ=${size}
 
+    if (( $power < 6 ));
+    then
+      # Do a warm-up run if it's a small problem size
+      taskset -c 2 $@ > /dev/null 2>&1
+    fi
     time=$(taskset -c 2 $@  | awk '{if ($1 == "Time-stepping") {print $5} }')
 
     echo $size $time
 done
+
+rm -f output.dat
diff --git a/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_trans.py b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_trans.py
new file mode 100644
index 00000000..23d28eaa
--- /dev/null
+++ b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_trans.py
@@ -0,0 +1,76 @@
+# -----------------------------------------------------------------------------
+# BSD 3-Clause License
+#
+# Copyright (c) 2022, Science and Technology Facilities Council.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+# Authors: R. W. Ford and A. R. Porter, STFC Daresbury Lab.
+
+'''Module providing a PSyclone transformation script that first converts
+the supplied PSyIR into a form compatible with the Stencil Intermediate
+Representation (SIR) and then adds OpenACC Kernels regions to it.
+
+'''
+
+from acc_kernels_trans import add_kernels
+from sir_loop_trans import make_sir_compliant
+from psyclone.psyir.transformations import ACCUpdateTrans
+from psyclone.transformations import ACCEnterDataTrans
+
+
+UPDATE_TRANS = ACCUpdateTrans()
+EDATA_TRANS = ACCEnterDataTrans()
+
+
+def trans(psy):
+    '''
+    Transformation routine for use with PSyclone. For each invoke, applies
+    :py:func:`sir_loop_trans.make_sir_compliant`, then
+    :py:func:`acc_kernels_trans.add_kernels` and ACC enter-data/update.
+
+    :param psy: the PSy object which this script will transform.
+    :type psy: :py:class:`psyclone.psyGen.PSy`
+
+    :returns: the transformed PSy object.
+    :rtype: :py:class:`psyclone.psyGen.PSy`
+
+    '''
+    for invoke in psy.invokes.invoke_list:
+        sched = invoke.schedule
+        if not sched:
+            print(f"Invoke {invoke.name} has no Schedule! Skipping...")
+            continue
+        make_sir_compliant(sched)
+        add_kernels(sched.children)
+        EDATA_TRANS.apply(sched)
+        UPDATE_TRANS.apply(sched)
+        print(sched.view())
+
+    return psy
diff --git a/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_um_trans.py b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_um_trans.py
new file mode 100644
index 00000000..390ae05d
--- /dev/null
+++ b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_um_trans.py
@@ -0,0 +1,68 @@
+# -----------------------------------------------------------------------------
+# BSD 3-Clause License
+#
+# Copyright (c) 2022, Science and Technology Facilities Council.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+# Authors: R. W. Ford and A. R. Porter, STFC Daresbury Lab.
+
+'''Module providing a PSyclone transformation script that first converts
+the supplied PSyIR into a form compatible with the Stencil Intermediate
+Representation (SIR) and then adds OpenACC Kernels regions to it.
+
+'''
+
+from acc_kernels_trans import add_kernels
+from sir_loop_trans import make_sir_compliant
+
+
+def trans(psy):
+    '''
+    Transformation routine for use with PSyclone. For each invoke, applies
+    :py:func:`sir_loop_trans.make_sir_compliant` and then
+    :py:func:`acc_kernels_trans.add_kernels`. No data directives are added.
+
+    :param psy: the PSy object which this script will transform.
+    :type psy: :py:class:`psyclone.psyGen.PSy`
+
+    :returns: the transformed PSy object.
+    :rtype: :py:class:`psyclone.psyGen.PSy`
+
+    '''
+    for invoke in psy.invokes.invoke_list:
+        sched = invoke.schedule
+        if not sched:
+            print(f"Invoke {invoke.name} has no Schedule! Skipping...")
+            continue
+        make_sir_compliant(sched)
+        add_kernels(sched.children)
+        print(sched.view())
+
+    return psy
diff --git a/benchmarks/nemo/tracer_advection/scripts/sir_loop_trans.py b/benchmarks/nemo/tracer_advection/scripts/sir_loop_trans.py
new file mode 100644
index 00000000..016a41f8
--- /dev/null
+++ b/benchmarks/nemo/tracer_advection/scripts/sir_loop_trans.py
@@ -0,0 +1,113 @@
+# -----------------------------------------------------------------------------
+# BSD 3-Clause License
+#
+# Copyright (c) 2022, Science and Technology Facilities Council
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# -----------------------------------------------------------------------------
+# Author: R. W. Ford, STFC Daresbury Lab
+
+'''Module providing a transformation script that converts the supplied
+PSyIR to the Stencil intermediate representation (SIR). Translation to
+the SIR is limited to the NEMO API. The NEMO API has no algorithm
+layer so all of the original code is captured in the invoke
+objects. Therefore by translating all of the invoke objects, all of
+the original code is translated.
+
+'''
+
+from psyclone.psyir.nodes import Assignment
+from psyclone.psyir.transformations import HoistTrans
+from psyclone.domain.nemo.transformations import NemoAllArrayRange2LoopTrans, \
+    NemoAllArrayAccess2LoopTrans
+
+
+def trans(psy):
+    '''Entry point called by PSyclone. Passes the schedule of every
+    invoke in the supplied PSy object to make_sir_compliant() so that it
+    can be represented in the SIR. The schedules are transformed in place.
+    This transformation routine is limited to the NEMO API.
+
+    :param psy: the PSy object which this script will transform.
+    :type psy: :py:class:`psyclone.psyGen.PSy`
+
+    :returns: the transformed PSy object.
+    :rtype: :py:class:`psyclone.psyGen.PSy`
+
+    '''
+
+    # There is no algorithm layer in the NEMO API so the invokes hold all of
+    # the original code: transforming the schedule of each of them therefore
+    # makes the whole of that code compatible with SIR generation.
+    for schedule in (invoke.schedule for invoke in psy.invokes.invoke_list):
+        make_sir_compliant(schedule)
+
+    return psy
+
+
+def make_sir_compliant(schedule):
+    '''
+    Applies various transformations, in place, to the supplied schedule to
+    replace any features that cannot be represented in SIR with alternatives:
+
+    1. Converts any accesses of individual array elements into 1-trip loops.
+    2. Transforms array assignments into loops.
+    3. Hoists assignments that are immediate children of loops over levels.
+
+    :param schedule: the schedule to transform.
+    :type schedule: :py:class:`psyclone.psyir.nodes.Schedule`
+
+    '''
+    array_range_trans = NemoAllArrayRange2LoopTrans()
+    array_access_trans = NemoAllArrayAccess2LoopTrans()
+    hoist_trans = HoistTrans()
+
+    # Transform any single index accesses in array assignments
+    # (e.g. a(1)) into 1-trip loops.
+    for assignment in schedule.walk(Assignment):
+        array_access_trans.apply(assignment)
+
+    # Transform any array assignments (Fortran ':' notation) into loops.
+    for assignment in schedule.walk(Assignment):
+        array_range_trans.apply(assignment)
+
+    # Remove any loop invariant assignments inside k-loops to make
+    # them perfectly nested. At the moment this transformation
+    # does not perform any dependence analysis validation so could
+    # move code that should not be moved, see issue
+    # #1387. However, it is known that it is safe to apply this
+    # transformation to this particular code
+    # (tra_adv_compute.F90).
+    for loop in schedule.loops():
+        # Hoist only from the immediate body of each loop over levels.
+        if loop.loop_type == "levels":
+            for child in loop.loop_body[:]:
+                if isinstance(child, Assignment):
+                    hoist_trans.apply(child)