From f68a2c61dc31fa2b9b399ac628979ce0bfad24cc Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Tue, 21 Oct 2025 10:54:42 +0100
Subject: [PATCH 01/25] Update submodules

---
 shared/FortCL     | 2 +-
 shared/PSyclone   | 2 +-
 shared/dl_esm_inf | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/shared/FortCL b/shared/FortCL
index d516ed01..401148e4 160000
--- a/shared/FortCL
+++ b/shared/FortCL
@@ -1 +1 @@
-Subproject commit d516ed01ea23565bfc4f531a795d2c7a2a57fe50
+Subproject commit 401148e4b6d6efdd4d0157123b118ed07d831446
diff --git a/shared/PSyclone b/shared/PSyclone
index 106543da..c9c20b1e 160000
--- a/shared/PSyclone
+++ b/shared/PSyclone
@@ -1 +1 @@
-Subproject commit 106543dafe26fe114de192f27311637a85a28a81
+Subproject commit c9c20b1ee96c10352b31463276408ad33ab84752
diff --git a/shared/dl_esm_inf b/shared/dl_esm_inf
index ad209e9d..358402ec 160000
--- a/shared/dl_esm_inf
+++ b/shared/dl_esm_inf
@@ -1 +1 @@
-Subproject commit ad209e9d252995bd83127de4c481232ca14ed655
+Subproject commit 358402ecc4d88e93a62a3ca13dc9d20d2eb27f90

From 588d403c00debb2ae90b7ff6333ce0d7f0cf7558 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 09:30:35 +0100
Subject: [PATCH 02/25] Update PSyclone flags in tra_adv makefile

---
 .../nemo/tracer_advection/original/Makefile   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/original/Makefile b/benchmarks/nemo/tracer_advection/original/Makefile
index 16d37045..d535c660 100644
--- a/benchmarks/nemo/tracer_advection/original/Makefile
+++ b/benchmarks/nemo/tracer_advection/original/Makefile
@@ -49,7 +49,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer
 DL_TIMER_NAME = libdl_timer_omp.a
 
 # Shorthand for invoking PSyclone.
-PSYCLONE = psyclone -api nemo -l output ${PSYCLONE_PROFILE}
+PSYCLONE = psyclone -l output ${PSYCLONE_PROFILE}
 
 # Add necessary flags for Nvidia nvtx instrumentation
 ifeq ($(ENABLE_NVIDIA_PROFILE),yes)
@@ -71,14 +71,14 @@ tra_adv_serial: dl_timer ./tra_adv.F90
 
 tra_adv_omp_cpu_levels: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/omp_cpu_levels_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/omp_cpu_levels_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${OMPFLAGS} -I../${DL_TIMER_DIR}/src" \
             LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 tra_adv_omp_cpu: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/omp_cpu_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/omp_cpu_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${OMPFLAGS} -I../${DL_TIMER_DIR}/src" \
             LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
@@ -87,42 +87,42 @@ tra_adv_omp_cpu: dl_timer ./tra_adv.F90
 
 tra_adv_acc_kernels_unified_memory: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 tra_adv_acc_kernels_explicit_data_movement: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_kernels_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/acc_kernels_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 tra_adv_acc_loops_unified_memory: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_loops_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/acc_loops_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 tra_adv_acc_loops_explicit_data_movement: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_loops_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/acc_loops_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 tra_adv_acc_mixed_unified_memory: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_mixed_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/acc_mixed_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
 
 tra_adv_acc_mixed_explicit_data_movement: dl_timer ./tra_adv.F90
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_mixed_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/acc_mixed_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \
            LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@
@@ -132,7 +132,7 @@ ifndef UMEMFLAGS
 	$(error The OMP offload target requires OpenMP unified memory but the UMEMFLAGS environment variable is not set)
 endif
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/omp_gpu_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90
+	${PSYCLONE} -s ../scripts/omp_gpu_trans.py -o $@/tra_adv.f90 ./tra_adv.F90
 	cp Makefile_gen $@/Makefile
 	${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \
             FORT_FLAGS="${F90FLAGS} ${OMPTARGETFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \

From 2a8c0d228a6af98a46aa67c27a10977f6cd0942c Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 09:32:21 +0100
Subject: [PATCH 03/25] Add new compiler-setup script for spack

---
 compiler_setup/spack_nvidia.sh | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 compiler_setup/spack_nvidia.sh

diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh
new file mode 100644
index 00000000..11fc3096
--- /dev/null
+++ b/compiler_setup/spack_nvidia.sh
@@ -0,0 +1,11 @@
+# Build settings for the Nvidia compiler
+# ================================================
+# Fortran compiler
+
+# ==============================
+export F90=$FC
+export PSYCLONE_NVIDIA_LIB_DIR=${HOME}/Projects/PSyclone/lib/profiling/nvidia
+export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative"
+export OMPFLAGS="-mp"
+export UMEMFLAGS="-gpu=managed"
+

From defe249de3f5bf84894597d868048bfb9f00f13e Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 11:00:14 +0100
Subject: [PATCH 04/25] Update omp cpu and gpu scripts

---
 .../tracer_advection/scripts/omp_cpu_trans.py |  30 +-
 .../tracer_advection/scripts/omp_gpu_trans.py |  22 +-
 .../nemo/tracer_advection/scripts/utils.py    | 276 +++++++++++++-----
 3 files changed, 219 insertions(+), 109 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py
index c77813e7..ee7e85b4 100644
--- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2022, Science and Technology Facilities Council.
+# Copyright (c) 2022-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -36,38 +36,34 @@
 ''' PSyclone transformation script to insert OpenMP Parallel Loop directives
 to the outermost loop that is parallelisable, including implicit loops.'''
 
-from psyclone.psyGen import TransInfo
+from psyclone.psyir.nodes import Node, Routine
+from psyclone.transformations import OMPParallelTrans, OMPLoopTrans
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy):
+def trans(psyir: Node):
     ''' Add OpenMP Parallel Loop directive to all loops, including implicit
     ones to target CPU parallelism.
 
-    :param psy: the PSy object which this script will transform.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    :returns: the transformed PSy object.
-    :rtype: :py:class:`psyclone.psyGen.PSy`
+    :param psyir: the PSyIR which this script will transform.
 
     '''
-    omp_parallel_trans = TransInfo().get_trans_name('OMPParallelTrans')
-    omp_loop_trans = TransInfo().get_trans_name('OMPLoopTrans')
+    omp_parallel_trans = OMPParallelTrans()
+    omp_loop_trans = OMPLoopTrans()
 
-    print("Invokes found:")
-    for invoke in psy.invokes.invoke_list:
-        print(invoke.name)
+    print("Routines found:")
+    for routine in psyir.walk(Routine):
+        print(routine.name)
 
         normalise_loops(
-                invoke.schedule,
-                unwrap_array_ranges=True,
+                routine,
+                convert_array_notation=True,
                 hoist_expressions=False,
         )
 
         insert_explicit_loop_parallelism(
-                invoke.schedule,
+                routine,
                 region_directive_trans=omp_parallel_trans,
                 loop_directive_trans=omp_loop_trans,
                 collapse=False
         )
-
-    return psy
diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
index be2f4927..92663046 100644
--- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
@@ -36,19 +36,17 @@
 ''' PSyclone transformation script to insert OpenMP Target Loop directives
 to the outermost loop that is parallelisable, including implicit loops. '''
 
+from psyclone.psyir.nodes import Node, Routine
 from psyclone.psyir.transformations import OMPTargetTrans, OMPLoopTrans
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy):
+def trans(psy: Node):
     ''' Add OpenMP Target and Loop directives to all loops, including the
     implicit ones, to parallelise the code and execute it in an acceleration
     device.
 
-    :param psy: the PSy object which this script will transform.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    :returns: the transformed PSy object.
-    :rtype: :py:class:`psyclone.psyGen.PSy`
+    :param psy: the PSyIR which this script will transform.
 
     '''
     omp_target_trans = OMPTargetTrans()
@@ -56,21 +54,17 @@ def trans(psy):
     omp_loop_trans.omp_directive = "teamsdistributeparalleldo"
     omp_loop_trans.omp_schedule = "none"
 
-    print("Invokes found:")
-    for invoke in psy.invokes.invoke_list:
-        print(invoke.name)
+    print("Routines found:")
+    for routine in psy.walk(Routine):
+        print(routine.name)
 
         normalise_loops(
-                invoke.schedule,
-                unwrap_array_ranges=True,
-                hoist_expressions=True,
+                routine,
         )
 
         insert_explicit_loop_parallelism(
-                invoke.schedule,
+                routine,
                 region_directive_trans=omp_target_trans,
                 loop_directive_trans=omp_loop_trans,
                 collapse=True
         )
-
-    return psy
diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py
index 8c9f530a..eed22fb8 100644
--- a/benchmarks/nemo/tracer_advection/scripts/utils.py
+++ b/benchmarks/nemo/tracer_advection/scripts/utils.py
@@ -35,16 +35,30 @@
 
 ''' Utilities file to parallelise Nemo code. '''
 
-from psyclone.domain.nemo.transformations import NemoAllArrayRange2LoopTrans
-from psyclone.errors import InternalError
-from psyclone.psyir.nodes import Loop, Assignment, Directive, CodeBlock, Call
-from psyclone.psyir.transformations import HoistLoopBoundExprTrans, HoistTrans
-from psyclone.transformations import TransformationError, ACCKernelsTrans
+import os
+from typing import List, Union
+
+from psyclone.psyir.nodes import (
+    Assignment, Directive, CodeBlock, Call, IfBlock, IntrinsicCall, Loop, Node,
+    Reference, Return, Routine, Schedule, StructureReference)
+from psyclone.psyir.symbols import DataSymbol
+from psyclone.psyir.transformations import (
+    ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, HoistLoopBoundExprTrans,
+    HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, ProfileTrans,
+    Reference2ArrayRangeTrans, ScalarisationTrans)
+from psyclone.transformations import TransformationError
+
+# If routine names contain these substrings then we do not profile them
+PROFILING_IGNORE = []
 
 
 def normalise_loops(
         schedule,
-        unwrap_array_ranges: bool = True,
+        hoist_local_arrays: bool = True,
+        convert_array_notation: bool = True,
+        loopify_array_intrinsics: bool = True,
+        convert_range_loops: bool = True,
+        scalarise_loops: bool = False,
         hoist_expressions: bool = True,
         ):
     ''' Normalise all loops in the given schedule so that they are in an
@@ -52,20 +66,77 @@ def normalise_loops(
     them.
 
     :param schedule: the PSyIR Schedule to transform.
-    :param unwrap_array_ranges: whether to convert ranges to explicit loops.
-    :param hoist_expressions: whether to hoist bounds and loop invariant \
+    :type schedule: :py:class:`psyclone.psyir.nodes.Node`
+    :param bool hoist_local_arrays: whether to hoist local arrays.
+    :param bool convert_array_notation: whether to convert array notation
+        to explicit loops.
+    :param bool loopify_array_intrinsics: whether to convert intrinsics that
+        operate on arrays to explicit loops (currently only maxval).
+    :param bool convert_range_loops: whether to convert ranges to explicit
+        loops.
+    :param scalarise_loops: whether to attempt to convert arrays to scalars
+        where possible, default is False.
+    :param hoist_expressions: whether to hoist bounds and loop invariant
         statements out of the loop nest.
     '''
-    if unwrap_array_ranges:
+    if hoist_local_arrays:
+        # Apply the HoistLocalArraysTrans when possible, it cannot be applied
+        # to files with statement functions because it will attempt to put the
+        # allocate above it, which is not valid Fortran.
+        try:
+            HoistLocalArraysTrans().apply(schedule)
+        except TransformationError:
+            pass
+
+    if convert_array_notation:
+        # Make sure all array dimensions are explicit
+        for reference in schedule.walk(Reference):
+            part_of_the_call = reference.ancestor(Call)
+            if part_of_the_call:
+                if not part_of_the_call.is_elemental:
+                    continue
+            if isinstance(reference.symbol, DataSymbol):
+                try:
+                    Reference2ArrayRangeTrans().apply(reference)
+                except TransformationError:
+                    pass
+
+    if loopify_array_intrinsics:
+        for intr in schedule.walk(IntrinsicCall):
+            if intr.intrinsic.name == "MAXVAL":
+                try:
+                    Maxval2LoopTrans().apply(intr)
+                except TransformationError as err:
+                    print(err.value)
+
+    if convert_range_loops:
         # Convert all array implicit loops to explicit loops
-        explicit_loops = NemoAllArrayRange2LoopTrans()
+        explicit_loops = ArrayAssignment2LoopsTrans()
         for assignment in schedule.walk(Assignment):
-            explicit_loops.apply(assignment)
+            if assignment.walk(StructureReference):
+                continue  # TODO #2951 Fix issues with structure_refs
+            try:
+                explicit_loops.apply(assignment)
+            except TransformationError:
+                pass
+
+    if scalarise_loops:
+        # Apply scalarisation to every loop. Execute this in reverse order
+        # as sometimes we can scalarise earlier loops if following loops
+        # have already been scalarised.
+        loops = schedule.walk(Loop)
+        loops.reverse()
+        scalartrans = ScalarisationTrans()
+        for loop in loops:
+            scalartrans.apply(loop)
 
     if hoist_expressions:
         # First hoist all possible expressions
         for loop in schedule.walk(Loop):
-            HoistLoopBoundExprTrans().apply(loop)
+            try:
+                HoistLoopBoundExprTrans().apply(loop)
+            except TransformationError:
+                pass
 
         # Hoist all possible assignments (in reverse order so the inner loop
         # constants are hoisted all the way out if possible)
@@ -81,109 +152,158 @@ def insert_explicit_loop_parallelism(
         schedule,
         region_directive_trans=None,
         loop_directive_trans=None,
-        collapse: bool = True
+        collapse: bool = True,
+        privatise_arrays: bool = False,
+        asynchronous_parallelism: bool = False,
+        uniform_intrinsics_only: bool = False,
+        enable_reductions: bool = False,
         ):
     ''' For each loop in the schedule that doesn't already have a Directive
     as an ancestor, attempt to insert the given region and loop directives.
 
-    :param region_directive_trans: PSyclone transformation to insert the \
+    :param schedule: the PSyIR Schedule to transform.
+    :type schedule: :py:class:`psyclone.psyir.nodes.Node`
+    :param region_directive_trans: PSyclone transformation that inserts the
         region directive.
-    :param loop_directive_trans: PSyclone transformation to use to insert the \
-        loop directive.
-    :param collapse: whether to attempt to insert the collapse clause to as \
+    :type region_directive_trans: \
+        :py:class:`psyclone.psyGen.Transformation`
+    :param loop_directive_trans: PSyclone transformation that inserts the
+        loop parallelisation directive.
+    :type loop_directive_trans: \
+        :py:class:`psyclone.psyGen.Transformation`
+    :param collapse: whether to attempt to insert the collapse clause to as
         many nested loops as possible.
-    '''
+    :param privatise_arrays: whether to attempt to privatise arrays that cause
+        write-write race conditions.
+    :param asynchronous_parallelism: whether to attempt to add asynchronicity
+        to the parallel sections.
+    :param uniform_intrinsics_only: if True, prevents offloading loops
+        with non-reproducible device intrinsics.
+    :param enable_reductions: whether to enable generation of reduction
+        clauses automatically.
 
+    '''
+    nemo_v4 = os.environ.get('NEMOV4', False)
+    if schedule.name == "ts_wgt":
+        return  # TODO #2937 WaW dependency incorrectly considered private
     # Add the parallel directives in each loop
     for loop in schedule.walk(Loop):
         if loop.ancestor(Directive):
             continue  # Skip if an outer loop is already parallelised
 
-        try:
-            loop_directive_trans.apply(loop)
-            # Only add the region directive if the loop was successfully
-            # parallelised.
-            if region_directive_trans is not None:
-                region_directive_trans.apply(loop.parent.parent)
-        except TransformationError as err:
-            # This loop can not be transformed, proceed to next loop
-            print("Loop not parallelised because:", str(err))
-            continue
+        opts = {"collapse": collapse, "privatise_arrays": privatise_arrays,
+                "verbose": True, "nowait": asynchronous_parallelism,
+                "enable_reductions": enable_reductions}
 
-        if collapse:
-            # Count the number of perfectly nested loops
-            num_nested_loops = 0
-            next_loop = loop
-            while isinstance(next_loop, Loop):
-                num_nested_loops += 1
-                if len(next_loop.loop_body.children) > 1:
-                    break
-                next_loop = next_loop.loop_body.children[0]
+        if uniform_intrinsics_only:
+            opts["device_string"] = "nvfortran-uniform"
 
-            if num_nested_loops > 1:
-                loop.parent.parent.collapse = num_nested_loops
+        routine_name = loop.ancestor(Routine).name
 
+        if ('dyn_spg' in routine_name and len(loop.walk(Loop)) > 2):
+            loop.append_preceding_comment(
+                "PSyclone: Loop not parallelised because it is in 'dyn_spg' "
+                "and is not the inner loop")
+            continue
 
-def valid_kernel(node):
-    '''
-    Whether the sub-tree that has `node` at its root is eligible to be
-    enclosed within an OpenACC KERNELS directive.
+        try:
+            # First check that the region_directive is feasible for this region
+            if region_directive_trans:
+                # TODO psyclone/#3066 - validate *should* accept a single Node
+                # but currently has a bug and doesn't so we have to make a
+                # list and pass that.
+                region_directive_trans.validate([loop], options=opts)
 
-    :param node: the node in the PSyIR to check.
-    :type node: :py:class:`psyclone.psyir.nodes.Node`
+            # If it is, apply the parallelisation directive
+            loop_directive_trans.apply(loop, options=opts)
 
-    :returns: True if the sub-tree can be enclosed in a KERNELS region.
-    :rtype: bool
+            # And if successful, the region directive on top.
+            if region_directive_trans:
+                region_directive_trans.apply(loop.parent.parent, options=opts)
+        except TransformationError:
+            # This loop cannot be transformed, proceed to next loop.
+            # The parallelisation restrictions will be explained with a comment
+            # associated with the loop in the generated output.
+            continue
 
-    '''
-    excluded_node_types = (CodeBlock, Call)
-    return node.walk(excluded_node_types) == []
+    # If we are adding asynchronous parallelism then we now try to minimise
+    # the number of barriers.
+    if asynchronous_parallelism:
+        minsync_trans = OMPMinimiseSyncTrans()
+        minsync_trans.apply(schedule)
 
 
-def add_kernels(children, default_present=True):
+def add_profiling(children: Union[List[Node], Schedule]):
     '''
-    Walks through the PSyIR inserting OpenACC KERNELS directives at as
-    high a level as possible.
+    Walks down the PSyIR and inserts the largest possible profiling regions
+    in place. Code inside functions or that contains directives is excluded.
 
-    :param children: list of sibling Nodes in PSyIR that are candidates for \
-                     inclusion in an ACC KERNELS region.
-    :type children: list of :py:class:`psyclone.psyir.nodes.Node`
-    :param bool default_present: whether or not to supply the \
-                          DEFAULT(PRESENT) clause to ACC KERNELS directives.
+    :param children: a Schedule or sibling nodes in the PSyIR to which to
+        attempt to add profiling regions.
 
     '''
+    if children and isinstance(children, Schedule):
+        # If we are given a Schedule, we look at its children.
+        children = children.children
+
     if not children:
         return
 
+    # We do not want profiling calipers inside functions (such as the
+    # PSyclone-generated comparison functions).
+    parent_routine = children[0].ancestor(Routine)
+    if parent_routine and parent_routine.return_symbol:
+        return
+
     node_list = []
     for child in children[:]:
-        # Can this node be included in a kernels region?
-        if not valid_kernel(child):
-            try_kernels_trans(node_list, default_present)
+        # Do we want this node to be included in a profiling region?
+        if child.walk((Directive, Return)):
+            # It contains a directive or return statement so we put what we
+            # have so far inside a profiling region.
+            add_profile_region(node_list)
+            # A node that is not included in a profiling region marks the
+            # end of the current candidate region so reset the list.
             node_list = []
-            # It can't so go down a level and try again
-            add_kernels(child.children)
+            # Now we go down a level and try again without attempting to put
+            # profiling below directives or within Assignments
+            if isinstance(child, IfBlock):
+                add_profiling(child.if_body)
+                add_profiling(child.else_body)
+            elif not isinstance(child, (Assignment, Directive)):
+                add_profiling(child.children)
         else:
+            # We can add this node to our list for the current region
             node_list.append(child)
-    try_kernels_trans(node_list, default_present)
+    add_profile_region(node_list)
 
 
-def try_kernels_trans(nodes, default_present):
+def add_profile_region(nodes):
     '''
-    Attempt to enclose the supplied list of nodes within a kernels
-    region. If the transformation fails then the error message is
-    reported but execution continues.
+    Attempt to put the supplied list of nodes within a profiling region.
 
-    :param nodes: list of Nodes to enclose within a Kernels region.
+    :param nodes: list of sibling PSyIR nodes to enclose.
     :type nodes: list of :py:class:`psyclone.psyir.nodes.Node`
-    :param bool default_present: whether or not to supply the \
-                          DEFAULT(PRESENT) clause to ACC KERNELS directives.
 
     '''
-    if not nodes:
-        return
-    try:
-        ACCKernelsTrans().apply(nodes, {"default_present": default_present})
-    except (TransformationError, InternalError) as err:
-        print(f"Failed to transform nodes: {nodes}")
-        print(f"Error was: {err}")
+    if nodes:
+        # Check whether we should be adding profiling inside this routine
+        routine_name = nodes[0].ancestor(Routine).name.lower()
+        if any(ignore in routine_name for ignore in PROFILING_IGNORE):
+            return
+        if len(nodes) == 1:
+            if isinstance(nodes[0], CodeBlock) and \
+               len(nodes[0].get_ast_nodes) == 1:
+                # Don't create profiling regions for CodeBlocks consisting
+                # of a single statement
+                return
+            if isinstance(nodes[0], IfBlock) and \
+               "was_single_stmt" in nodes[0].annotations and \
+               isinstance(nodes[0].if_body[0], CodeBlock):
+                # We also don't put single statements consisting of
+                # 'IF(condition) CALL blah()' inside profiling regions
+                return
+        try:
+            ProfileTrans().apply(nodes)
+        except TransformationError:
+            pass

From bf09178c9a2a7d6e2ee58227562068c0897b662d Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 11:13:09 +0100
Subject: [PATCH 05/25] Fix the omp_cpu_levels_trans script

---
 .../scripts/omp_cpu_levels_trans.py           | 47 ++++++++++---------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py
index bc6b7ef9..4c8388b5 100644
--- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2018-2023, Science and Technology Facilities Council
+# Copyright (c) 2018-2025, Science and Technology Facilities Council
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -33,40 +33,43 @@
 # -----------------------------------------------------------------------------
 # Authors: R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab
 
-'''A simple transformation script for the introduction of OpenMP with PSyclone.
+'''A very simple transformation script for the introduction of OpenMP
+ to certain loops using PSyclone.
 
- >>> psyclone -api "nemo" -s ./omp_cpu_levels_trans.py tra_adv.F90
+ >>> psyclone -s ./omp_cpu_levels_trans.py tra_adv.F90
 
 This should produce a lot of output, ending with generated Fortran.
 
 '''
 
-from psyclone.psyGen import TransInfo
-from psyclone.nemo import NemoKern
+from psyclone.psyir.nodes import Loop, Node, Routine
+from psyclone.transformations import OMPParallelLoopTrans, TransformationError
 
-def trans(psy):
+# Set up some loop_type inference rules in order to reference useful domain
+# loop constructs by name
+Loop.set_loop_type_inference_rules({
+    "lon": {"variable": "ji"},
+    "lat": {"variable": "jj"},
+    "levels": {"variable": "jk"}
+})
+
+
+def trans(psy: Node):
     ''' Transform a specific Schedule by making all loops
     over levels OpenMP parallel.
 
-    :param psy: the object holding all information on the PSy layer \
-                to be modified.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-
-    :returns: the transformed PSy object
-    :rtype:  :py:class:`psyclone.psyGen.PSy`
+    :param psy: the PSyIR to be modified.
 
     '''
     # Get the transformation we will apply
-    ompt = TransInfo().get_trans_name('OMPParallelLoopTrans')
-    for invoke in psy.invokes.invoke_list:
-        # Get the Schedule of the target routine
-        sched = invoke.schedule
+    ompt = OMPParallelLoopTrans()
+    for sched in psy.walk(Routine):
         # Apply the OMP transformation to each loop over levels containing
         # a kernel
         for loop in sched.loops():
-            kernels = loop.walk(NemoKern)
-            if kernels and loop.loop_type == "levels":
-                ompt.apply(loop)
-
-    # Return the modified psy object
-    return psy
+            if loop.loop_type == "levels":
+                try:
+                    ompt.apply(loop)
+                except TransformationError as err:
+                    loop.append_preceding_comment(
+                        f"Loop cannot be parallelised because: {err}")

From 4c6aeb887ae4a8895b629afc327707c60546e75b Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 11:43:46 +0100
Subject: [PATCH 06/25] Update acc kernels managed target

---
 .../acc_kernels_unified_memory_trans.py       | 29 ++++----
 .../nemo/tracer_advection/scripts/utils.py    | 71 ++++++++++++++++++-
 2 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py
index fd9ad97b..93a84179 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2018-2022, Science and Technology Facilities Council.
+# Copyright (c) 2018-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -34,12 +34,12 @@
 # Authors: R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab
 
 '''A transformation script that seeks to apply OpenACC KERNELS directives to
-NEMO style code. In order to use it you must first install PSyclone. See
+generic Fortran code. In order to use it you must first install PSyclone. See
 README.md in the top-level directory.
 
 Once you have psyclone installed, this may be used by doing:
 
- $ psyclone -api nemo -s <this_script> <target_source_file>
+ $ psyclone -s <this_script> <target_source_file>
 
 The transformation script attempts to insert Kernels directives at the
 highest possible location(s) in the schedule tree (i.e. to enclose as
@@ -47,24 +47,25 @@
 
 '''
 
+from psyclone.psyir.nodes import Node, Routine
+
 from utils import add_kernels
 
 
-def trans(psy):
+def trans(psy: Node):
     '''A PSyclone-script compliant transformation function. Applies
-    OpenACC 'kernels' to NEMO code.
+    OpenACC 'kernels' to existing code.
 
-    :param psy: The PSy layer object to apply transformations to.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    '''
+    :param psy: The PSyIR to apply transformations to.
 
-    print("Invokes found:")
-    print("\n".join([str(name) for name in psy.invokes.names]))
+    '''
+    print("Routines found:")
 
-    for invoke in psy.invokes.invoke_list:
+    for routine in psy.walk(Routine):
+        print(routine.name)
 
-        if not invoke.schedule:
-            print(f"Invoke {invoke.name} has no Schedule! Skipping...")
+        if not routine.children:
+            print(f"Routine {routine.name} is empty! Skipping...")
             continue
 
-        add_kernels(invoke.schedule.children)
+        add_kernels(routine.children)
diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py
index eed22fb8..3d419b5d 100644
--- a/benchmarks/nemo/tracer_advection/scripts/utils.py
+++ b/benchmarks/nemo/tracer_advection/scripts/utils.py
@@ -38,12 +38,14 @@
 import os
 from typing import List, Union
 
+from psyclone.errors import InternalError
 from psyclone.psyir.nodes import (
     Assignment, Directive, CodeBlock, Call, IfBlock, IntrinsicCall, Loop, Node,
     Reference, Return, Routine, Schedule, StructureReference)
 from psyclone.psyir.symbols import DataSymbol
 from psyclone.psyir.transformations import (
-    ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, HoistLoopBoundExprTrans,
+    ACCKernelsTrans, ArrayAssignment2LoopsTrans, HoistLocalArraysTrans,
+    HoistLoopBoundExprTrans,
     HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, ProfileTrans,
     Reference2ArrayRangeTrans, ScalarisationTrans)
 from psyclone.transformations import TransformationError
@@ -307,3 +309,70 @@ def add_profile_region(nodes):
             ProfileTrans().apply(nodes)
         except TransformationError:
             pass
+
+
+def valid_kernel(node):
+    '''
+    Whether the sub-tree that has `node` at its root is eligible to be
+    enclosed within an OpenACC KERNELS directive.
+
+    :param node: the node in the PSyIR to check.
+    :type node: :py:class:`psyclone.psyir.nodes.Node`
+
+    :returns: True if the sub-tree can be enclosed in a KERNELS region.
+    :rtype: bool
+
+    '''
+    try:
+        ACCKernelsTrans().validate(node, {"disable_loop_check": True})
+    except TransformationError:
+        return False
+
+    return True
+
+
+def add_kernels(children: list[Node], default_present: bool = True):
+    '''
+    Walks through the PSyIR inserting OpenACC KERNELS directives at as
+    high a level as possible.
+
+    :param children: list of sibling Nodes in PSyIR that are candidates for
+                     inclusion in an ACC KERNELS region.
+    :param default_present: whether or not to supply the
+        DEFAULT(PRESENT) clause to ACC KERNELS directives.
+
+    '''
+    if not children:
+        return
+
+    node_list = []
+    for child in children[:]:
+        # Can this node be included in a kernels region?
+        if not valid_kernel(child):
+            try_kernels_trans(node_list, default_present)
+            node_list = []
+            # It can't, so go down a level (same clause choice) and retry
+            add_kernels(child.children, default_present)
+        else:
+            node_list.append(child)
+    try_kernels_trans(node_list, default_present)
+
+
+def try_kernels_trans(nodes: list[Node], default_present: bool):
+    '''
+    Attempt to enclose the supplied list of nodes within a kernels
+    region. If the transformation fails then the error message is
+    reported but execution continues.
+
+    :param nodes: list of Nodes to enclose within a Kernels region.
+    :param default_present: whether or not to supply the
+        DEFAULT(PRESENT) clause to ACC KERNELS directives.
+
+    '''
+    if not nodes:
+        return
+    try:
+        ACCKernelsTrans().apply(nodes, {"default_present": default_present})
+    except (TransformationError, InternalError) as err:
+        print(f"Failed to transform nodes: {nodes}")
+        print(f"Error was: {err}")

From b4d3c69b3ff3c9a9b74d9a7323c875d7e984861a Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 11:53:28 +0100
Subject: [PATCH 07/25] Fix ACC kernels with explicit mem movement

---
 ...cc_kernels_explicit_data_movement_trans.py | 32 ++++++++-----------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py
index 6260568f..04f546b3 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2018-2022, Science and Technology Facilities Council.
+# Copyright (c) 2018-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,7 @@
 
 Once you have psyclone installed, this may be used by doing:
 
- $ psyclone -api nemo -s <this_script> <target_source_file>
+ $ psyclone -s <this_script> <target_source_file>
 
 The transformation script attempts to insert Kernels directives at the
 highest possible location(s) in the schedule tree (i.e. to enclose as
@@ -47,30 +47,26 @@
 
 '''
 
-from psyclone.psyir.nodes import Directive
+from psyclone.psyir.nodes import Directive, Routine, Node
 from psyclone.psyir.transformations import ACCUpdateTrans
 from psyclone.transformations import ACCEnterDataTrans
 from utils import add_kernels
 
 
-def trans(psy):
+def trans(psy: Node):
     '''A PSyclone-script compliant transformation function. Applies
-    OpenACC 'kernels' and 'data movement' directives to NEMO code.
+    OpenACC 'kernels' and 'data movement' directives to generic code.
 
-    :param psy: The PSy layer object to apply transformations to.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    '''
-
-    print("Invokes found:")
-    print("\n".join([str(name) for name in psy.invokes.names]))
+    :param psy: The PSyIR to apply transformations to.
 
-    for invoke in psy.invokes.invoke_list:
+    '''
+    for sched in psy.walk(Routine):
 
-        if not invoke.schedule:
-            print(f"Invoke {invoke.name} has no Schedule! Skipping...")
+        if not sched.children:
+            print(f"Routine {sched.name} is empty! Skipping...")
             continue
 
-        add_kernels(invoke.schedule.children)
-        if invoke.schedule.walk(Directive):
-            ACCEnterDataTrans().apply(invoke.schedule)
-            ACCUpdateTrans().apply(invoke.schedule)
+        add_kernels(sched.children)
+        if sched.walk(Directive):
+            ACCEnterDataTrans().apply(sched)
+            ACCUpdateTrans().apply(sched)

From e29ff3637bb41886cd07a50711f27f0f5ec03c61 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 11:59:54 +0100
Subject: [PATCH 08/25] Fix acc loops with explicit mem

---
 .../acc_loops_explicit_data_movement_trans.py | 40 ++++++++-----------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py
index 4c5d1ebc..38aa257d 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2022-2023, Science and Technology Facilities Council.
+# Copyright (c) 2022-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -37,47 +37,41 @@
 to the outermost loop that is parallelisable, including implicit loops. This
 script also adds OpenACC explicit data movement directives.'''
 
-from psyclone.psyir.nodes import Directive
-from psyclone.psyGen import TransInfo
+from psyclone.psyir.nodes import Directive, Node, Routine
 from psyclone.psyir.transformations import ACCUpdateTrans
-from psyclone.transformations import ACCEnterDataTrans
+from psyclone.transformations import (
+    ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans)
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy):
+def trans(psy: Node):
     ''' Add OpenACC Parallel Loop directive to all loops, including implicit
     ones, to target GPU parallelism and explicit data movement directives.
 
-    :param psy: the PSy object which this script will transform.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-
-    :returns: the transformed PSy object.
-    :rtype: :py:class:`psyclone.psyGen.PSy`
+    :param psy: the PSyIR which this script will transform.
 
     '''
-    acc_parallel_trans = TransInfo().get_trans_name('ACCParallelTrans')
-    acc_loop_trans = TransInfo().get_trans_name('ACCLoopTrans')
+    acc_parallel_trans = ACCParallelTrans()
+    acc_loop_trans = ACCLoopTrans()
 
-    print("Invokes found:")
-    for invoke in psy.invokes.invoke_list:
-        print(invoke.name)
+    print("Routines found:")
+    for routine in psy.walk(Routine):
+        print(routine.name)
 
         # Convert array and range notation to loops and hoist expressions
         normalise_loops(
-            invoke.schedule,
-            unwrap_array_ranges=True,
+            routine,
+            scalarise_loops=True,
             hoist_expressions=True,
         )
 
         insert_explicit_loop_parallelism(
-            invoke.schedule,
+            routine,
             region_directive_trans=acc_parallel_trans,
             loop_directive_trans=acc_loop_trans,
             collapse=True
         )
 
-        if invoke.schedule.walk(Directive):
-            ACCEnterDataTrans().apply(invoke.schedule)
-            ACCUpdateTrans().apply(invoke.schedule)
-
-    return psy
+        if routine.walk(Directive):
+            ACCEnterDataTrans().apply(routine)
+            ACCUpdateTrans().apply(routine)

From 1f3184876d955afd24ecae944d3c16a05956a286 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 12:04:49 +0100
Subject: [PATCH 09/25] Update acc-mixed with explicit mem

---
 .../acc_mixed_explicit_data_movement_trans.py | 38 +++++++++----------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py
index db5dee0b..8d7b9661 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2018-2023, Science and Technology Facilities Council.
+# Copyright (c) 2018-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -39,7 +39,7 @@
 
 Once you have psyclone installed, this may be used by doing:
 
- $ psyclone -api nemo -s ./acc_mixed_explicit_data_movement_trans.py <file>
+ $ psyclone -s ./acc_mixed_explicit_data_movement_trans.py <file>
 
 This should produce a lot of output, ending with generated Fortran. Note
 that the Fortran source files provided to PSyclone must have already been
@@ -47,50 +47,48 @@
 
 '''
 
-from psyclone.psyir.nodes import Directive
+from psyclone.psyir.nodes import Directive, Node, Routine
 from psyclone.psyir.transformations import ACCUpdateTrans
 from psyclone.transformations import ACCEnterDataTrans, ACCLoopTrans
 from utils import add_kernels, normalise_loops, \
     insert_explicit_loop_parallelism
 
 
-def trans(psy):
+def trans(psy: Node):
     '''A PSyclone-script compliant transformation function. Applies
     OpenACC 'kernels', 'loop' and explicit 'data' directives to NEMO code.
 
-    :param psy: The PSy layer object to apply transformations to.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    '''
+    :param psy: The PSyIR to apply transformations to.
 
-    print("Invokes found:")
-    print("\n".join([str(name) for name in psy.invokes.names]))
+    '''
+    print("Routines found:")
+    print("\n".join([rt.name for rt in psy.walk(Routine)]))
 
-    for invoke in psy.invokes.invoke_list:
+    for routine in psy.walk(Routine):
 
-        sched = invoke.schedule
-        if not sched:
-            print("Invoke {invoke.name} has no Schedule! Skipping...")
+        if not routine.children:
+            print(f"Routine {routine.name} is empty! Skipping...")
             continue
 
         # Convert array and range syntax to explicit loops
         normalise_loops(
-            invoke.schedule,
-            unwrap_array_ranges=True,
+            routine,
+            scalarise_loops=True,
             hoist_expressions=True,
         )
 
         # Add OpenACC Loop directives
         insert_explicit_loop_parallelism(
-            invoke.schedule,
+            routine,
             region_directive_trans=None,
             loop_directive_trans=ACCLoopTrans(),
             collapse=True
         )
 
         # Add OpenACC Kernel directives
-        add_kernels(sched.children)
+        add_kernels(routine.children)
 
         # Add OpenACC data directives
-        if invoke.schedule.walk(Directive):
-            ACCEnterDataTrans().apply(invoke.schedule)
-            ACCUpdateTrans().apply(invoke.schedule)
+        if routine.walk(Directive):
+            ACCEnterDataTrans().apply(routine)
+            ACCUpdateTrans().apply(routine)

From 401ece2e82b43dde3d5f84be7f2d56fb787f8ea1 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 12:13:53 +0100
Subject: [PATCH 10/25] Modernise compiler flags and fix acc-mixed-umem target

---
 .../scripts/acc_mixed_unified_memory_trans.py | 29 +++++++++----------
 compiler_setup/spack_nvidia.sh                |  2 +-
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py
index 3cc7b6c0..3ed50735 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2018-2023, Science and Technology Facilities Council.
+# Copyright (c) 2018-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -47,43 +47,42 @@
 
 '''
 
+from psyclone.psyir.nodes import Node, Routine
 from psyclone.transformations import ACCLoopTrans
 from utils import add_kernels, normalise_loops, \
     insert_explicit_loop_parallelism
 
 
-def trans(psy):
+def trans(psy: Node):
     '''A PSyclone-script compliant transformation function. Applies
     OpenACC 'kernels' and 'loop' directives to NEMO code.
 
-    :param psy: The PSy layer object to apply transformations to.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    '''
+    :param psy: The PSyIR to apply transformations to.
 
-    print("Invokes found:")
-    print("\n".join([str(name) for name in psy.invokes.names]))
+    '''
+    print("Routines found:")
+    print("\n".join([rt.name for rt in psy.walk(Routine)]))
 
-    for invoke in psy.invokes.invoke_list:
+    for routine in psy.walk(Routine):
 
-        sched = invoke.schedule
-        if not sched:
-            print("Invoke {invoke.name} has no Schedule! Skipping...")
+        if not routine.children:
+            print(f"Routine {routine.name} is empty! Skipping...")
             continue
 
         # Convert array and range syntax to explicit loops
         normalise_loops(
-            invoke.schedule,
-            unwrap_array_ranges=True,
+            routine,
+            scalarise_loops=True,
             hoist_expressions=True,
         )
 
         # Add OpenACC Loop directives
         insert_explicit_loop_parallelism(
-            invoke.schedule,
+            routine,
             region_directive_trans=None,
             loop_directive_trans=ACCLoopTrans(),
             collapse=True
         )
 
         # Add OpenACC Kernel directives
-        add_kernels(sched.children)
+        add_kernels(routine.children)
diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh
index 11fc3096..491e9a83 100644
--- a/compiler_setup/spack_nvidia.sh
+++ b/compiler_setup/spack_nvidia.sh
@@ -7,5 +7,5 @@ export F90=$FC
 export PSYCLONE_NVIDIA_LIB_DIR=${HOME}/Projects/PSyclone/lib/profiling/nvidia
 export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative"
 export OMPFLAGS="-mp"
-export UMEMFLAGS="-gpu=managed"
+export UMEMFLAGS="-gpu=mem:managed"
 

From 9191008504c74220069316622e9f79c743881130 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 12:19:51 +0100
Subject: [PATCH 11/25] Fix acc-loops-um

---
 .../scripts/acc_loops_unified_memory_trans.py | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py
index 1557dd54..4ccf0ca6 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py
@@ -1,7 +1,7 @@
 # -----------------------------------------------------------------------------
 # BSD 3-Clause License
 #
-# Copyright (c) 2022-2023, Science and Technology Facilities Council.
+# Copyright (c) 2022-2025, Science and Technology Facilities Council.
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
@@ -36,38 +36,35 @@
 ''' PSyclone transformation script to insert OpenACC Parallel Loop directives
 to the outermost loop that is parallelisable, including implicit loops.'''
 
-from psyclone.psyGen import TransInfo
+from psyclone.psyir.nodes import Node, Routine
+from psyclone.transformations import ACCParallelTrans, ACCLoopTrans
+
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy):
+def trans(psy: Node):
     ''' Add OpenACC Parallel Loop directive to all loops, including implicit
     ones to target GPU parallelism.
 
-    :param psy: the PSy object which this script will transform.
-    :type psy: :py:class:`psyclone.psyGen.PSy`
-    :returns: the transformed PSy object.
-    :rtype: :py:class:`psyclone.psyGen.PSy`
+    :param psy: the PSyIR which this script will transform.
 
     '''
-    acc_parallel_trans = TransInfo().get_trans_name('ACCParallelTrans')
-    acc_loop_trans = TransInfo().get_trans_name('ACCLoopTrans')
+    acc_parallel_trans = ACCParallelTrans()
+    acc_loop_trans = ACCLoopTrans()
 
-    print("Invokes found:")
-    for invoke in psy.invokes.invoke_list:
-        print(invoke.name)
+    print("Routines found:")
+    for routine in psy.walk(Routine):
+        print(routine.name)
 
         normalise_loops(
-            invoke.schedule,
-            unwrap_array_ranges=True,
+            routine,
             hoist_expressions=True,
         )
 
         insert_explicit_loop_parallelism(
-            invoke.schedule,
+            routine,
             region_directive_trans=acc_parallel_trans,
             loop_directive_trans=acc_loop_trans,
             collapse=True
         )
 
-    return psy

From 6c5eb135339325895d2524d1c7fc639bf4fb4075 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 13:00:20 +0100
Subject: [PATCH 12/25] Rm path to profiling lib from spack-setup script

---
 compiler_setup/spack_nvidia.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh
index 491e9a83..5db0c629 100644
--- a/compiler_setup/spack_nvidia.sh
+++ b/compiler_setup/spack_nvidia.sh
@@ -4,8 +4,9 @@
 
 # ==============================
 export F90=$FC
-export PSYCLONE_NVIDIA_LIB_DIR=${HOME}/Projects/PSyclone/lib/profiling/nvidia
+
 export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative"
 export OMPFLAGS="-mp"
 export UMEMFLAGS="-gpu=mem:managed"
+export ACCFLAGS="-acc=gpu -gpu=ccnative"
 

From 87d307a1eb2c21d31134195af4893daa3a1e2bf5 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 13:03:57 +0100
Subject: [PATCH 13/25] Fix Makefile for tra-adv compute_in_subroutine

---
 .../tracer_advection/compute_in_subroutine/Makefile | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
index 90a55d02..a8d3a901 100644
--- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
+++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
@@ -22,7 +22,7 @@ DL_TIMER_NAME = libdl_timer_omp.a
 
 # Shorthand for invoking PSyclone with line-length limiting applied
 # to the output Fortran.
-PSYCLONE = psyclone -api nemo -l output
+PSYCLONE = psyclone -l output
 
 # Serial version.
 tra_adv_serial: dl_timer
@@ -45,7 +45,7 @@ tra_adv_no_auto_serial: dl_timer
 # OpenACC version using Unified Memory with timer around outer loop only.
 tra_adv_acc: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \
+	${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -58,7 +58,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
 	$(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
 endif
 	mkdir -p $@
-	${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \
+	${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -68,7 +68,7 @@ endif
 # Serial Fortran version after transformation to SIR-compliant form.
 tra_adv_sir: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \
+	${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \
             ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -78,7 +78,7 @@ tra_adv_sir: dl_timer
 # OpenACC added after transformation to SIR-compliant form.
 tra_adv_sir_acc: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \
+	${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -90,7 +90,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
 	$(error The tra_adv_sir_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
 endif
 	mkdir -p $@
-	${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -opsy \
+	${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -110,3 +110,4 @@ allclean: clean
 	rm -rf tra_adv_acc_prof
 	rm -rf tra_adv_sir
 	rm -rf tra_adv_sir_acc
+	rm -rf tra_adv_no_auto_serial

From d0d6c06aa79ae93d2ece2cf7c78efcf20843aef2 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 13:12:24 +0100
Subject: [PATCH 14/25] Fix other versions of tra_adv benchmark

---
 .../tracer_advection/compute_in_subroutine/Makefile    |  4 ++--
 benchmarks/nemo/tracer_advection/multi_kernel/Makefile | 10 +++++-----
 compiler_setup/spack_nvidia.sh                         |  1 +
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
index a8d3a901..593b6207 100644
--- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
+++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile
@@ -58,7 +58,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
 	$(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
 endif
 	mkdir -p $@
-	${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -o \
+	${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -90,7 +90,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
 	$(error The tra_adv_sir_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
 endif
 	mkdir -p $@
-	${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -o \
+	${PSYCLONE} --profile routines -s ../scripts/sir_kernels_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
diff --git a/benchmarks/nemo/tracer_advection/multi_kernel/Makefile b/benchmarks/nemo/tracer_advection/multi_kernel/Makefile
index 83b3d966..ab8b09bc 100644
--- a/benchmarks/nemo/tracer_advection/multi_kernel/Makefile
+++ b/benchmarks/nemo/tracer_advection/multi_kernel/Makefile
@@ -22,7 +22,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer
 DL_TIMER_NAME = libdl_timer_omp.a
 
 # Shorthand for invoking PSyclone.
-PSYCLONE = psyclone -api nemo -l output
+PSYCLONE = psyclone -l output
 
 # Serial version.
 tra_adv_serial: dl_timer
@@ -36,7 +36,7 @@ tra_adv_serial: dl_timer
 # OpenACC version with timer around outer loop only.
 tra_adv_acc: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \
+	${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -49,7 +49,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR
 	$(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined)
 endif
 	mkdir -p $@
-	${PSYCLONE} --profile invokes -s ../scripts/kernels_trans.py -opsy \
+	${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -59,7 +59,7 @@ endif
 # Serial Fortran version after transformation to SIR-compliant form.
 tra_adv_sir: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \
+	${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \
             ./tra_adv_compute.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
@@ -69,7 +69,7 @@ tra_adv_sir: dl_timer
 # OpenACC added after transformation to SIR-compliant form.
 tra_adv_sir_acc: dl_timer
 	mkdir -p $@
-	${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \
+	${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \
             $@/tra_adv_compute.f90 ./tra_adv_compute.F90
 	cp Makefile_gen $@/Makefile
 	cp tra_adv_driver.F90 $@/.
diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh
index 5db0c629..6eb25ec0 100644
--- a/compiler_setup/spack_nvidia.sh
+++ b/compiler_setup/spack_nvidia.sh
@@ -5,6 +5,7 @@
 # ==============================
 export F90=$FC
 
+export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvToolsExt"
 export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative"
 export OMPFLAGS="-mp"
 export UMEMFLAGS="-gpu=mem:managed"

From 9c5526c42d0194f7e7649ac22ea77c68d679a708 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 13:16:38 +0100
Subject: [PATCH 15/25] Update GHA workflow file

---
 .github/workflows/makefile-test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/makefile-test.yml b/.github/workflows/makefile-test.yml
index 245a0a78..c6185ab8 100644
--- a/.github/workflows/makefile-test.yml
+++ b/.github/workflows/makefile-test.yml
@@ -47,12 +47,12 @@ on:
 jobs:
   build:
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
       with:
         submodules: recursive
-    - uses: actions/setup-python@v4
+    - uses: actions/setup-python@v5
     - run: python -m pip install --upgrade pip
     - run: cd shared/PSyclone && pip install .
     - name: Install dependencies

From ba634ea14e4367559f523f7e046f765de12db090 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 13:22:45 +0100
Subject: [PATCH 16/25] Update kokkos submodule

---
 shared/kokkos | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/kokkos b/shared/kokkos
index ae5fc649..aecc5dc8 160000
--- a/shared/kokkos
+++ b/shared/kokkos
@@ -1 +1 @@
-Subproject commit ae5fc649ef4b62b48a01123759ed066bff227b43
+Subproject commit aecc5dc8f5be7df3a4d8b9c6fa99f1212475bccc

From b856e783156470eec108894ba8ade1747984548a Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 16:04:06 +0100
Subject: [PATCH 17/25] Fix NEMOLite2D acc version

---
 .../fortran/boundary_conditions_mod.f90       |  1 -
 .../psykal/psyclone_scripts/acc_transform.py  | 21 ++++++++++++-------
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 b/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90
index ca47f880..ca10aa6a 100644
--- a/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90
+++ b/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90
@@ -5,7 +5,6 @@ module boundary_conditions_mod
        GO_STENCIL
   use kernel_mod, only: kernel_type, GO_POINTWISE, GO_DOFS, &
       GO_ALL_PTS, GO_INTERNAL_PTS
-  use physical_params_mod
   use grid_mod
   use field_mod
   implicit none
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
index 7725c55f..e453a9ea 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
@@ -3,7 +3,10 @@
 
 from psyclone.domain.common.transformations import KernelModuleInlineTrans
 from psyclone.psyGen import TransInfo
-from psyclone.psyir.nodes import Loop
+from psyclone.psyir.nodes import Loop, Routine
+from psyclone.transformations import (
+    ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans, ACCRoutineTrans,
+    KernelImportsToArguments)
 
 
 def trans(psy):
@@ -12,19 +15,23 @@ def trans(psy):
     tinfo = TransInfo()
     parallel_trans = tinfo.get_trans_name('ACCParallelTrans')
     loop_trans = tinfo.get_trans_name('ACCLoopTrans')
-    enter_data_trans = tinfo.get_trans_name('ACCEnterDataTrans')
-    routine_trans = tinfo.get_trans_name('ACCRoutineTrans')
-    glo2arg_trans = tinfo.get_trans_name('KernelImportsToArguments')
+    enter_data_trans = ACCEnterDataTrans()
+    routine_trans = ACCRoutineTrans()
+    glo2arg_trans = KernelImportsToArguments()
     inline_trans = KernelModuleInlineTrans()
 
-    invoke = psy.invokes.get('invoke_0')
-    schedule = invoke.schedule
+    schedule = psy.walk(Routine)[0]
 
     # Apply the OpenACC Loop transformation to *every* loop
     # in the schedule
     for child in schedule.children:
         if isinstance(child, Loop):
-            loop_trans.apply(child, {"collapse": 2})
+            # We need to ignore dependencies on 'va' because PSyclone correctly
+            # spots that there is a dependence in one of the boundary-condition
+            # kernels. However, we know that practically this isn't a problem
+            # because of the way the domain (mask) is configured.
+            loop_trans.apply(child, {"collapse": 2,
+                                     "ignore_dependencies_for": ["va"]})
 
     # Put all of the loops in a single parallel region
     parallel_trans.apply(schedule)

From 004f7e5f51ceee25f18ed6d0e3c9113b0d28fb23 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 16:17:09 +0100
Subject: [PATCH 18/25] Update all NEMOLite2D transformation scripts

---
 .../psykal/psyclone_scripts/acc_transform.py  |  8 +++----
 .../psykal/psyclone_scripts/ocl_transform.py  | 16 ++++++++------
 .../psyclone_scripts/omp_task_transform.py    | 22 ++++++++++++-------
 .../psykal/psyclone_scripts/omp_transform.py  | 11 +++++++---
 .../psyclone_scripts/serial_transform.py      |  9 +++++---
 5 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
index e453a9ea..52ff8c7c 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
@@ -18,7 +18,7 @@ def trans(psy):
     enter_data_trans = ACCEnterDataTrans()
     routine_trans = ACCRoutineTrans()
     glo2arg_trans = KernelImportsToArguments()
-    inline_trans = KernelModuleInlineTrans()
+    mod_inline_trans = KernelModuleInlineTrans()
 
     schedule = psy.walk(Routine)[0]
 
@@ -27,8 +27,8 @@ def trans(psy):
     for child in schedule.children:
         if isinstance(child, Loop):
             # We need to ignore dependencies on 'va' because PSyclone correctly
-            # spots that there is a dependence in one of the boundary-condition
-            # kernels. However, we know that practically this isn't a problem
+            # spots that there is a dependence in the bc_flather_v kernel.
+            # However, we know that practically this isn't a problem
             # because of the way the domain (mask) is configured.
             loop_trans.apply(child, {"collapse": 2,
                                      "ignore_dependencies_for": ["va"]})
@@ -44,6 +44,6 @@ def trans(psy):
     for kern in schedule.coded_kernels():
         glo2arg_trans.apply(kern)
         routine_trans.apply(kern)
-        inline_trans.apply(kern)
+        mod_inline_trans.apply(kern)
 
     return psy
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py
index 8235d3a0..6221871d 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py
@@ -3,10 +3,13 @@
 that PSyclone will generate an OpenCL PSy layer. '''
 
 import os
-from psyclone.psyGen import TransInfo
-from psyclone.domain.gocean.transformations import \
-    GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans
+
+from psyclone.domain.gocean.transformations import (
+    GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans)
 from psyclone.configuration import Config
+from psyclone.psyir.nodes import Routine
+from psyclone.transformations import (
+    KernelImportsToArguments)
 
 
 # Global variables to configure the PSyclone OpenCL generation:
@@ -33,13 +36,12 @@ def trans(psy):
     ''' Transform the schedule for OpenCL generation '''
 
     # Import transformations
-    tinfo = TransInfo()
-    globaltrans = tinfo.get_trans_name('KernelImportsToArguments')
+    globaltrans = KernelImportsToArguments()
     move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans()
     cltrans = GOOpenCLTrans()
 
-    # Get the invoke routine
-    schedule = psy.invokes.get('invoke_0').schedule
+    # Get the routine
+    schedule = psy.walk(Routine)[0]
 
     # Map the kernels by their name to different OpenCL queues. The multiple
     # command queues can be executed concurrently while each command queue
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
index 17be3e07..4b40ad59 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
@@ -2,20 +2,21 @@
 function via the -s option. It applies OpenMP tasking to every loop
 and inlines all kernels in the schedule.'''
 
-from psyclone.psyir.nodes import Loop
+from psyclone.psyir.nodes import Loop, Routine
 from psyclone.configuration import Config
-from psyclone.transformations import OMPParallelTrans, OMPSingleTrans, \
-                                     OMPTaskloopTrans, KernelModuleInlineTrans
-from psyclone.psyir.transformations import OMPTaskwaitTrans
-from psyclone.psyir.nodes import OMPTaskloopDirective, OMPTaskwaitDirective, \
-                                 OMPDirective, OMPParallelDirective
+from psyclone.domain.common.transformations import KernelModuleInlineTrans
+from psyclone.transformations import (
+    OMPParallelTrans, OMPSingleTrans)
+from psyclone.psyir.transformations import OMPTaskloopTrans, OMPTaskwaitTrans
+from psyclone.psyir.nodes import (OMPTaskloopDirective, OMPTaskwaitDirective,
+                                  OMPDirective, OMPParallelDirective)
 
 
 def trans(psy):
     '''Transformation entry point'''
     config = Config.get()
 
-    schedule = psy.invokes.get('invoke_0').schedule
+    schedule = psy.walk(Routine)[0]
 
     loop_trans = OMPTaskloopTrans(grainsize=32, nogroup=True)
     wait_trans = OMPTaskwaitTrans()
@@ -28,7 +29,12 @@ def trans(psy):
 
     for child in schedule.children:
         if isinstance(child, Loop):
-            loop_trans.apply(child)
+            # We need to ignore dependencies on 'va' because PSyclone correctly
+            # spots that there is a dependence in the bc_flather_v kernel.
+            # However, we know that practically this isn't a problem
+            # because of the way the domain (mask) is configured.
+            loop_trans.apply(child,
+                             options={"ignore_dependencies_for": ["va"]})
 
     single_trans = OMPSingleTrans()
     parallel_trans = OMPParallelTrans()
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
index 573cef38..6a50a357 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
@@ -5,7 +5,7 @@
 from psyclone.configuration import Config
 from psyclone.domain.common.transformations import KernelModuleInlineTrans
 from psyclone.psyGen import TransInfo
-from psyclone.psyir.nodes import Loop
+from psyclone.psyir.nodes import Loop, Routine
 
 
 def trans(psy):
@@ -17,7 +17,7 @@ def trans(psy):
     parallel_trans = tinfo.get_trans_name('OMPParallelTrans')
     module_inline_trans = KernelModuleInlineTrans()
 
-    schedule = psy.invokes.get('invoke_0').schedule
+    schedule = psy.walk(Routine)[0]
 
     # Inline all kernels in this Schedule
     for kernel in schedule.kernels():
@@ -30,7 +30,12 @@ def trans(psy):
             if isinstance(child, Loop):
                 parallel_loop_trans.apply(child)
         else:
-            loop_trans.apply(child)
+            # We need to ignore dependencies on 'va' because PSyclone correctly
+            # spots that there is a dependence in the bc_flather_v kernel.
+            # However, we know that practically this isn't a problem
+            # because of the way the domain (mask) is configured.
+            loop_trans.apply(child,
+                             options={"ignore_dependencies_for": ["va"]})
 
     if not config.distributed_memory:
         # If it is not distributed memory, enclose all of these loops
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py
index 1456cc2d..544e93ae 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py
@@ -2,14 +2,17 @@
 via the -s option. This script module-inline all kernels in the PSy-layer.'''
 
 from psyclone.domain.common.transformations import KernelModuleInlineTrans
+from psyclone.psyir.nodes import Node, Routine
 
 
-def trans(psy):
-    ''' Transformation script entry function '''
+def trans(psy: Node):
+    '''Entry point for PSyIR transformation. This script module-inlines
+    every user-supplied kernel that is called.
 
+    '''
     itrans = KernelModuleInlineTrans()
 
-    schedule = psy.invokes.get('invoke_0').schedule
+    schedule = psy.walk(Routine)[0]
 
     # Module-Inline all coded kernels in this Schedule
     for kernel in schedule.coded_kernels():

From fa9747aefd6605b71876cb0b7aaaf318af1376fc Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Thu, 23 Oct 2025 16:31:40 +0100
Subject: [PATCH 19/25] Rm unused and ancient script from Shallow

---
 benchmarks/shallow/SEQ/runme_loop_fuse.py | 25 -----------------------
 1 file changed, 25 deletions(-)
 delete mode 100644 benchmarks/shallow/SEQ/runme_loop_fuse.py

diff --git a/benchmarks/shallow/SEQ/runme_loop_fuse.py b/benchmarks/shallow/SEQ/runme_loop_fuse.py
deleted file mode 100644
index 0265c266..00000000
--- a/benchmarks/shallow/SEQ/runme_loop_fuse.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from parse import parse,ParseError
-from psyGen import PSyFactory,GenerationError
-#from algGen import Alg
-api="gocean"
-filename="shallow_gocean.f90"
-ast,invokeInfo=parse(filename,api=api,invoke_name="invoke")
-psy=PSyFactory(api).create(invokeInfo)
-print psy.gen
-#alg=Alg(ast,psy)
-
-print psy.invokes.names
-schedule=psy.invokes.get('invoke_0').schedule
-schedule.view()
-
-from psyGen import TransInfo
-t=TransInfo()
-print t.list
-#lf=t.get_trans_name('DoubleLoopFuse')
-lf=t.get_trans_name('LoopFuse')
-
-newschedule,memento=lf.apply(schedule.children[0],schedule.children[1])
-#newschedule,memento=lf.apply(schedule.children[0].children[0].children[0],schedule.children[1].children[0].children[0])
-newschedule.view()
-#psy.invokes.get('invoke_0')._schedule=newschedule
-#print psy.gen

From 20a6db24b9c40d1c8c17649b20fe8f4ec17c63d0 Mon Sep 17 00:00:00 2001
From: Sergi Siso <sergiesg@gmail.com>
Date: Tue, 9 Dec 2025 10:13:02 +0000
Subject: [PATCH 20/25] Update kokkos

---
 .../nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile   | 5 ++++-
 shared/kokkos                                                | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
index 9be53b00..65725e62 100644
--- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
+++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile
@@ -16,6 +16,9 @@ KOKKOS_PATH ?= $(SHARED_DIR)/kokkos
 KOKKOS_DEBUG ?= no  # Careful, 10x performance penalty in kernels.
 CXXFLAGS = $(CFLAGS)  # Use same CFLAGS to compile Kokkos library.
 
+# The Kokkos Makefile is deprecated, but we can still use it with:
+KOKKOS_USE_DEPRECATED_MAKEFILES=1
+
 # If no KOKKOS_DEVICES is specified, by default use the OpenMP
 KOKKOS_DEVICES ?= OpenMP
 
@@ -90,7 +93,7 @@ clean:
 	${MAKE} -C ${INF_DIR} clean
 	rm -f *.o *.mod *.MOD *~ *.dat
 	rm -f gnu_opt_report.txt *.optrpt
-	rm -rf KokkosCore_* Makefile.kokkos.f90
+	rm -rf KokkosCore_* Makefile.kokkos.f90 desul Desul_Config.tmp
 
 allclean: clean
 	rm -f *.exe fparser.log *.a
diff --git a/shared/kokkos b/shared/kokkos
index aecc5dc8..552f2375 160000
--- a/shared/kokkos
+++ b/shared/kokkos
@@ -1 +1 @@
-Subproject commit aecc5dc8f5be7df3a4d8b9c6fa99f1212475bccc
+Subproject commit 552f2375de06361f8a5662abc0859ae233b5d8f8

From bc336ee2ea2e6bae00454618220008de0de98575 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Mon, 15 Dec 2025 11:07:57 +0000
Subject: [PATCH 21/25] #101 update PSyclone to master

---
 shared/PSyclone | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/shared/PSyclone b/shared/PSyclone
index c9c20b1e..63d4c225 160000
--- a/shared/PSyclone
+++ b/shared/PSyclone
@@ -1 +1 @@
-Subproject commit c9c20b1ee96c10352b31463276408ad33ab84752
+Subproject commit 63d4c22552fb6cd5fafbd4185ef373a1d9e3713c

From cb4039630bf87b42e8aaa7d22291428cdb30ac47 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Mon, 15 Dec 2025 11:18:58 +0000
Subject: [PATCH 22/25] #101 update problem-size script and compiler flags

---
 .../tracer_advection/scripts/problemsize.sh   |  5 +++-
 compiler_setup/intel.sh                       |  6 ++---
 compiler_setup/nvidia_acc.sh                  | 24 ++++---------------
 compiler_setup/spack_nvidia.sh                |  2 +-
 4 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh
index 09441c3e..f7b7d985 100755
--- a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh
+++ b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh
@@ -2,6 +2,8 @@
 
 # Bash script to execute the tracer-advection benchmark with increasing
 # domain sizes.
+# By default the process is pinned to core 0. Please edit the taskset
+# command below if you wish to change this.
 
 if [ "$#" -lt 1 ] || [ ! -x "$1" ]; then
     echo "Wrong arguments. Usage: ../../problemsize.sh ./executable"
@@ -24,7 +26,8 @@ for power in $(seq 4 9); do
     export JPI=${size}
     export JPJ=${size}
 
-    time=$(taskset -c 2 $@  | awk '{if ($1 == "Time-stepping") {print $5} }')
+    # Execute - use taskset to pin the process to a core.
+    time=$(taskset -c 0 $@  | awk '{if ($1 == "Time-stepping") {print $5} }')
 
     echo $size $time
 done
diff --git a/compiler_setup/intel.sh b/compiler_setup/intel.sh
index dd3a70dd..ce334af0 100644
--- a/compiler_setup/intel.sh
+++ b/compiler_setup/intel.sh
@@ -41,10 +41,8 @@ OMPFLAGS="-qopenmp"
 LDFLAGS= 
 #LDFLAGS+= -fast
 
-# The archiver used to generate the API library. We must
-# use Intel's xiar if doing IPO as otherwise the library
-# doesn't contain the necessary symbols.
-AR=xiar
+# The archiver used to generate the API library.
+AR=ar
 ARFLAGS=cru
 
 export F90
diff --git a/compiler_setup/nvidia_acc.sh b/compiler_setup/nvidia_acc.sh
index 415e73a8..0cb5a438 100644
--- a/compiler_setup/nvidia_acc.sh
+++ b/compiler_setup/nvidia_acc.sh
@@ -11,30 +11,14 @@ CFLAGS="-g"
 F90FLAGS="-O3 -Minfo=all"
 # Debugging options
 #F90FLAGS"+=" -fcheck=all -fbacktrace -ffpe-trap=invalid -g -O0"
-# -Mcuda is for CUDA Fortran
-# nordc - do not link to routines compiled for device (ensure
-# kernel code is in-lined in loops)
-# cc = compute capability
-# Registers are shared by threads in an SMP. The more registers a kernel
-# uses, the fewer threads it can support. This parameter can be tuned and
-# should be a multiple of 8.
-# -Mcuda is required to build CUDA Fortran
-# For Quadro K600
-#F90FLAGS+=" -acc -ta=tesla:cc30,nordc -Mcuda=cc30,nordc"
-# For Tesla K20c
-#F90FLAGS+=" -acc -ta=tesla,cc35,maxregcount:80,nordc -Mcuda=cc35,maxregcount:80,nordc"
 # V100 with managed memory
-F90FLAGS+=" -acc=gpu -gpu=cc70,managed"
+F90FLAGS+=" -acc=gpu -gpu=cc70,mem:managed"
 # Linker flags
-# For Quadro K600
-#LDFLAGS+=" -acc -ta=tesla,cc30 -Mcuda=cc30,nordc"
-# For Tesla K20c
-#LDFLAGS="-acc -ta=nvidia,cc35 -Mcuda=cc35,nordc"
 # V100 with managed memory
-LDFLAGS="-acc=gpu -gpu=cc70,managed"
-# Location of various CUDA maths libraries. libnvToolsExt is required when
+LDFLAGS="-acc=gpu -gpu=cc70,mem:managed"
+# Location of various CUDA maths libraries. nvtx3interop is required when
 # using nvtx for profiling.
-LDFLAGS+=" -Mcuda -L${CUDA_MATH_DIR}/lib64  -lnvToolsExt"
+LDFLAGS+=" -cuda -L${CUDA_MATH_DIR}/lib64 -lnvtx3interop"
 # Flags to use when compiling with OpenMP support
 OMPFLAGS="-mp"
 # Command to use to create archive of object files
diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh
index 6eb25ec0..a3ac0450 100644
--- a/compiler_setup/spack_nvidia.sh
+++ b/compiler_setup/spack_nvidia.sh
@@ -5,7 +5,7 @@
 # ==============================
 export F90=$FC
 
-export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvToolsExt"
+export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvtx3interop"
 export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative"
 export OMPFLAGS="-mp"
 export UMEMFLAGS="-gpu=mem:managed"

From 432dc4c9d585da8cfa8dc8266210042d6890da44 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Mon, 15 Dec 2025 11:46:12 +0000
Subject: [PATCH 23/25] #101 update psyclone scripts to handle u- and v-flather
 kernels

---
 .../psykal/psyclone_scripts/acc_transform.py  | 16 ++++++++++------
 .../psyclone_scripts/omp_task_transform.py    | 12 ++++++++----
 .../psykal/psyclone_scripts/omp_transform.py  | 19 ++++++++++++-------
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
index 52ff8c7c..4d858485 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
@@ -26,12 +26,16 @@ def trans(psy):
     # in the schedule
     for child in schedule.children:
         if isinstance(child, Loop):
-            # We need to ignore dependencies on 'va' because PSyclone correctly
-            # spots that there is a dependence in the bc_flather_v kernel.
-            # However, we know that practically this isn't a problem
-            # because of the way the domain (mask) is configured.
-            loop_trans.apply(child, {"collapse": 2,
-                                     "ignore_dependencies_for": ["va"]})
+            opts = {"collapse": 2}
+            if child.kernels()[0].name == "bc_flather_v_code":
+                # We need to ignore dependencies on 'va' because PSyclone
+                # spots that there is a dependence in the bc_flather_v kernel.
+                # However, we know that practically this isn't a problem
+                # because of the way the domain (mask) is configured.
+                opts["ignore_dependencies_for"] = ["va%data"]
+            if child.kernels()[0].name == "bc_flather_u_code":
+                opts["ignore_dependencies_for"] = ["ua%data"]
+            loop_trans.apply(child, options=opts)
 
     # Put all of the loops in a single parallel region
     parallel_trans.apply(schedule)
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
index 4b40ad59..179ed4d0 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
@@ -29,12 +29,16 @@ def trans(psy):
 
     for child in schedule.children:
         if isinstance(child, Loop):
-            # We need to ignore dependencies on 'va' because PSyclone correctly
-            # spots that there is a dependence in the bc_flather_v kernel.
+            # We need to ignore dependencies on '{u,v}a' because PSyclone
+            # spots that there is a dependence in the bc_flather_{u,v} kernels.
             # However, we know that practically this isn't a problem
             # because of the way the domain (mask) is configured.
-            loop_trans.apply(child,
-                             options={"ignore_dependencies_for": ["va"]})
+            options = {}
+            if child.kernels()[0].name == "bc_flather_v_code":
+                options["ignore_dependencies_for"] = ["va%data"]
+            if child.kernels()[0].name == "bc_flather_u_code":
+                options["ignore_dependencies_for"] = ["ua%data"]
+            loop_trans.apply(child, options=options)
 
     single_trans = OMPSingleTrans()
     parallel_trans = OMPParallelTrans()
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
index 6a50a357..d150587b 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
@@ -26,16 +26,21 @@ def trans(psy):
     # Apply the OpenMPLoop transformation to every child in the schedule or
     # OpenMPParallelLoop to every Loop if it has distributed memory.
     for child in schedule.children:
+        # We need to ignore dependencies on '{u,v}a' because PSyclone correctly
+        # spots that there is a dependence in the bc_flather_{u,v} kernels.
+        # However, we know that practically this isn't a problem
+        # because these boundary-condition kernels only update values
+        # outside the domain.
+        options = {}
+        if child.kernels()[0].name == "bc_flather_v_code":
+            options["ignore_dependencies_for"] = ["va%data"]
+        if child.kernels()[0].name == "bc_flather_u_code":
+            options["ignore_dependencies_for"] = ["ua%data"]
         if config.distributed_memory:
             if isinstance(child, Loop):
-                parallel_loop_trans.apply(child)
+                parallel_loop_trans.apply(child, options=options)
         else:
-            # We need to ignore dependencies on 'va' because PSyclone correctly
-            # spots that there is a dependence in the bc_flather_v kernel.
-            # However, we know that practically this isn't a problem
-            # because of the way the domain (mask) is configured.
-            loop_trans.apply(child,
-                             options={"ignore_dependencies_for": ["va"]})
+            loop_trans.apply(child, options=options)
 
     if not config.distributed_memory:
         # If it is not distributed memory, enclose all of these loops

From badb92d649c4d195dc15c058bce8ce9ecb51da36 Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Mon, 15 Dec 2025 12:05:59 +0000
Subject: [PATCH 24/25] #101 tidy tra-adv scripts

---
 .../psykal/psyclone_scripts/acc_transform.py  | 12 +--
 .../psyclone_scripts/omp_task_transform.py    |  6 +-
 .../psykal/psyclone_scripts/omp_transform.py  | 13 +--
 ...cc_kernels_explicit_data_movement_trans.py |  6 +-
 .../acc_kernels_unified_memory_trans.py       |  6 +-
 .../acc_loops_explicit_data_movement_trans.py |  6 +-
 .../scripts/acc_loops_unified_memory_trans.py |  6 +-
 .../acc_mixed_explicit_data_movement_trans.py |  8 +-
 .../scripts/acc_mixed_unified_memory_trans.py |  8 +-
 .../scripts/omp_cpu_levels_trans.py           |  6 +-
 .../tracer_advection/scripts/omp_cpu_trans.py |  2 +-
 .../tracer_advection/scripts/omp_gpu_trans.py |  6 +-
 .../nemo/tracer_advection/scripts/utils.py    | 92 +------------------
 13 files changed, 44 insertions(+), 133 deletions(-)

diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
index 4d858485..3d1273b7 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py
@@ -3,15 +3,15 @@
 
 from psyclone.domain.common.transformations import KernelModuleInlineTrans
 from psyclone.psyGen import TransInfo
-from psyclone.psyir.nodes import Loop, Routine
+from psyclone.psyir.nodes import Container, Loop, Routine
 from psyclone.transformations import (
     ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans, ACCRoutineTrans,
     KernelImportsToArguments)
 
 
-def trans(psy):
-    ''' Take the supplied psy object, apply OpenACC transformations
-    to the schedule of invoke_0 and return the new psy object '''
+def trans(psyir: Container) -> None:
+    ''' Take the supplied psyir object and apply OpenACC transformations
+    to the first Routine found within it. '''
     tinfo = TransInfo()
     parallel_trans = tinfo.get_trans_name('ACCParallelTrans')
     loop_trans = tinfo.get_trans_name('ACCLoopTrans')
@@ -20,7 +20,7 @@ def trans(psy):
     glo2arg_trans = KernelImportsToArguments()
     mod_inline_trans = KernelModuleInlineTrans()
 
-    schedule = psy.walk(Routine)[0]
+    schedule = psyir.walk(Routine)[0]
 
     # Apply the OpenACC Loop transformation to *every* loop
     # in the schedule
@@ -49,5 +49,3 @@ def trans(psy):
         glo2arg_trans.apply(kern)
         routine_trans.apply(kern)
         mod_inline_trans.apply(kern)
-
-    return psy
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
index 179ed4d0..b10e07ff 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py
@@ -2,7 +2,7 @@
 function via the -s option. It applies OpenMP tasking to every loop
 and inlines all kernels in the schedule.'''
 
-from psyclone.psyir.nodes import Loop, Routine
+from psyclone.psyir.nodes import Container, Loop, Routine
 from psyclone.configuration import Config
 from psyclone.domain.common.transformations import KernelModuleInlineTrans
 from psyclone.transformations import (
@@ -12,11 +12,11 @@
                                   OMPDirective, OMPParallelDirective)
 
 
-def trans(psy):
+def trans(psyir: Container) -> None:
     '''Transformation entry point'''
     config = Config.get()
 
-    schedule = psy.walk(Routine)[0]
+    schedule = psyir.walk(Routine)[0]
 
     loop_trans = OMPTaskloopTrans(grainsize=32, nogroup=True)
     wait_trans = OMPTaskwaitTrans()
diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
index d150587b..ff085752 100644
--- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
+++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py
@@ -5,11 +5,14 @@
 from psyclone.configuration import Config
 from psyclone.domain.common.transformations import KernelModuleInlineTrans
 from psyclone.psyGen import TransInfo
-from psyclone.psyir.nodes import Loop, Routine
+from psyclone.psyir.nodes import Container, Loop, Routine
 
 
-def trans(psy):
-    ''' Transformation entry point '''
+def trans(psyir: Container) -> None:
+    '''
+    Transformation entry point.
+
+    '''
     config = Config.get()
     tinfo = TransInfo()
     parallel_loop_trans = tinfo.get_trans_name('GOceanOMPParallelLoopTrans')
@@ -17,7 +20,7 @@ def trans(psy):
     parallel_trans = tinfo.get_trans_name('OMPParallelTrans')
     module_inline_trans = KernelModuleInlineTrans()
 
-    schedule = psy.walk(Routine)[0]
+    schedule = psyir.walk(Routine)[0]
 
     # Inline all kernels in this Schedule
     for kernel in schedule.kernels():
@@ -46,5 +49,3 @@ def trans(psy):
         # If it is not distributed memory, enclose all of these loops
         # within a single OpenMP PARALLEL region
         parallel_trans.apply(schedule.children)
-
-    return psy
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py
index 04f546b3..a8375404 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py
@@ -53,14 +53,14 @@
 from utils import add_kernels
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     '''A PSyclone-script compliant transformation function. Applies
     OpenACC 'kernels' and 'data movement' directives to generic code.
 
-    :param psy: The PSyIR to apply transformations to.
+    :param psyir: The PSyIR to apply transformations to.
 
     '''
-    for sched in psy.walk(Routine):
+    for sched in psyir.walk(Routine):
 
         if not sched.children:
             print(f"Routine {sched.name} is empty! Skipping...")
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py
index 93a84179..904ffef9 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py
@@ -52,16 +52,16 @@
 from utils import add_kernels
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     '''A PSyclone-script compliant transformation function. Applies
     OpenACC 'kernels' to existing code.
 
-    :param psy: The PSyIR to apply transformations to.
+    :param psyir: The PSyIR to apply transformations to.
 
     '''
     print("Routines found:")
 
-    for routine in psy.walk(Routine):
+    for routine in psyir.walk(Routine):
         print(routine.name)
 
         if not routine.children:
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py
index 38aa257d..ac42bce2 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py
@@ -44,18 +44,18 @@
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     ''' Add OpenACC Parallel Loop directive to all loops, including implicit
     ones, to target GPU parallelism and explicit data movement directives.
 
-    :param psy: the PSyIR which this script will transform.
+    :param psyir: the PSyIR which this script will transform.
 
     '''
     acc_parallel_trans = ACCParallelTrans()
     acc_loop_trans = ACCLoopTrans()
 
     print("Routines found:")
-    for routine in psy.walk(Routine):
+    for routine in psyir.walk(Routine):
         print(routine.name)
 
         # Convert array and range notation to loops and hoist expressions
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py
index 4ccf0ca6..efef8816 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py
@@ -42,18 +42,18 @@
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     ''' Add OpenACC Parallel Loop directive to all loops, including implicit
     ones to target GPU parallelism.
 
-    :param psy: the PSyIR which this script will transform.
+    :param psyir: the PSyIR which this script will transform.
 
     '''
     acc_parallel_trans = ACCParallelTrans()
     acc_loop_trans = ACCLoopTrans()
 
     print("Routines found:")
-    for routine in psy.walk(Routine):
+    for routine in psyir.walk(Routine):
         print(routine.name)
 
         normalise_loops(
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py
index 8d7b9661..b8f63f50 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py
@@ -54,17 +54,17 @@
     insert_explicit_loop_parallelism
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     '''A PSyclone-script compliant transformation function. Applies
     OpenACC 'kernels', 'loop' and explicit 'data' directives to NEMO code.
 
-    :param psy: The PSyIR to apply transformations to.
+    :param psyir: The PSyIR to apply transformations to.
 
     '''
     print("Routines found:")
-    print("\n".join([rt.name for rt in psy.walk(Routine)]))
+    print("\n".join([rt.name for rt in psyir.walk(Routine)]))
 
-    for routine in psy.walk(Routine):
+    for routine in psyir.walk(Routine):
 
         if not routine.children:
             print("Routine {routine.name} is empty! Skipping...")
diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py
index 3ed50735..c3cd28cb 100644
--- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py
@@ -53,17 +53,17 @@
     insert_explicit_loop_parallelism
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     '''A PSyclone-script compliant transformation function. Applies
     OpenACC 'kernels' and 'loop' directives to NEMO code.
 
-    :param psy: The PSyIR to apply transformations to.
+    :param psyir: The PSyIR to apply transformations to.
 
     '''
     print("Routines found:")
-    print("\n".join([rt.name for rt in psy.walk(Routine)]))
+    print("\n".join([rt.name for rt in psyir.walk(Routine)]))
 
-    for routine in psy.walk(Routine):
+    for routine in psyir.walk(Routine):
 
         if not routine.children:
             print("Invoke {routine.name} is empty! Skipping...")
diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py
index 4c8388b5..c2e9d9fb 100644
--- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py
@@ -54,16 +54,16 @@
 })
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     ''' Transform a specific Schedule by making all loops
     over levels OpenMP parallel.
 
-    :param psy: the PSyIR to be modified.
+    :param psyir: the PSyIR to be modified.
 
     '''
     # Get the transformation we will apply
     ompt = OMPParallelLoopTrans()
-    for sched in psy.walk(Routine):
+    for sched in psyir.walk(Routine):
         # Apply the OMP transformation to each loop over levels containing
         # a kernel
         for loop in sched.loops():
diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py
index ee7e85b4..1860eead 100644
--- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py
@@ -41,7 +41,7 @@
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psyir: Node):
+def trans(psyir: Node) -> None:
     ''' Add OpenMP Parallel Loop directive to all loops, including implicit
     ones to target CPU parallelism.
 
diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
index 92663046..6612a9f5 100644
--- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
+++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py
@@ -41,12 +41,12 @@
 from utils import insert_explicit_loop_parallelism, normalise_loops
 
 
-def trans(psy: Node):
+def trans(psyir: Node) -> None:
     ''' Add OpenMP Target and Loop directives to all loops, including the
     implicit ones, to parallelise the code and execute it in an acceleration
     device.
 
-    :param psy: the PSyIR which this script will transform.
+    :param psyir: the PSyIR which this script will transform.
 
     '''
     omp_target_trans = OMPTargetTrans()
@@ -55,7 +55,7 @@ def trans(psy: Node):
     omp_loop_trans.omp_schedule = "none"
 
     print("Routines found:")
-    for routine in psy.walk(Routine):
+    for routine in psyir.walk(Routine):
         print(routine.name)
 
         normalise_loops(
diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py
index 3d419b5d..db02f2be 100644
--- a/benchmarks/nemo/tracer_advection/scripts/utils.py
+++ b/benchmarks/nemo/tracer_advection/scripts/utils.py
@@ -46,13 +46,10 @@
 from psyclone.psyir.transformations import (
     ACCKernelsTrans, ArrayAssignment2LoopsTrans, HoistLocalArraysTrans,
     HoistLoopBoundExprTrans,
-    HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, ProfileTrans,
+    HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans,
     Reference2ArrayRangeTrans, ScalarisationTrans)
 from psyclone.transformations import TransformationError
 
-# If routine names contain these substrings then we do not profile them
-PROFILING_IGNORE = []
-
 
 def normalise_loops(
         schedule,
@@ -186,8 +183,7 @@ def insert_explicit_loop_parallelism(
 
     '''
     nemo_v4 = os.environ.get('NEMOV4', False)
-    if schedule.name == "ts_wgt":
-        return  # TODO #2937 WaW dependency incorrectly considered private
+
     # Add the parallel directives in each loop
     for loop in schedule.walk(Loop):
         if loop.ancestor(Directive):
@@ -200,14 +196,6 @@ def insert_explicit_loop_parallelism(
         if uniform_intrinsics_only:
             opts["device_string"] = "nvfortran-uniform"
 
-        routine_name = loop.ancestor(Routine).name
-
-        if ('dyn_spg' in routine_name and len(loop.walk(Loop)) > 2):
-            loop.append_preceding_comment(
-                "PSyclone: Loop not parallelised because it is in 'dyn_spg' "
-                "and is not the inner loop")
-            continue
-
         try:
             # First check that the region_directive is feasible for this region
             if region_directive_trans:
@@ -235,82 +223,6 @@ def insert_explicit_loop_parallelism(
         minsync_trans.apply(schedule)
 
 
-def add_profiling(children: Union[List[Node], Schedule]):
-    '''
-    Walks down the PSyIR and inserts the largest possible profiling regions
-    in place. Code inside functions or that contains directives is excluded.
-
-    :param children: a Schedule or sibling nodes in the PSyIR to which to
-        attempt to add profiling regions.
-
-    '''
-    if children and isinstance(children, Schedule):
-        # If we are given a Schedule, we look at its children.
-        children = children.children
-
-    if not children:
-        return
-
-    # We do not want profiling calipers inside functions (such as the
-    # PSyclone-generated comparison functions).
-    parent_routine = children[0].ancestor(Routine)
-    if parent_routine and parent_routine.return_symbol:
-        return
-
-    node_list = []
-    for child in children[:]:
-        # Do we want this node to be included in a profiling region?
-        if child.walk((Directive, Return)):
-            # It contains a directive or return statement so we put what we
-            # have so far inside a profiling region.
-            add_profile_region(node_list)
-            # A node that is not included in a profiling region marks the
-            # end of the current candidate region so reset the list.
-            node_list = []
-            # Now we go down a level and try again without attempting to put
-            # profiling below directives or within Assignments
-            if isinstance(child, IfBlock):
-                add_profiling(child.if_body)
-                add_profiling(child.else_body)
-            elif not isinstance(child, (Assignment, Directive)):
-                add_profiling(child.children)
-        else:
-            # We can add this node to our list for the current region
-            node_list.append(child)
-    add_profile_region(node_list)
-
-
-def add_profile_region(nodes):
-    '''
-    Attempt to put the supplied list of nodes within a profiling region.
-
-    :param nodes: list of sibling PSyIR nodes to enclose.
-    :type nodes: list of :py:class:`psyclone.psyir.nodes.Node`
-
-    '''
-    if nodes:
-        # Check whether we should be adding profiling inside this routine
-        routine_name = nodes[0].ancestor(Routine).name.lower()
-        if any(ignore in routine_name for ignore in PROFILING_IGNORE):
-            return
-        if len(nodes) == 1:
-            if isinstance(nodes[0], CodeBlock) and \
-               len(nodes[0].get_ast_nodes) == 1:
-                # Don't create profiling regions for CodeBlocks consisting
-                # of a single statement
-                return
-            if isinstance(nodes[0], IfBlock) and \
-               "was_single_stmt" in nodes[0].annotations and \
-               isinstance(nodes[0].if_body[0], CodeBlock):
-                # We also don't put single statements consisting of
-                # 'IF(condition) CALL blah()' inside profiling regions
-                return
-        try:
-            ProfileTrans().apply(nodes)
-        except TransformationError:
-            pass
-
-
 def valid_kernel(node):
     '''
     Whether the sub-tree that has `node` at its root is eligible to be

From 04084426f809babba379dd36bea779825b81c6fe Mon Sep 17 00:00:00 2001
From: Andrew Porter <andrew.porter@stfc.ac.uk>
Date: Mon, 15 Dec 2025 14:02:08 +0000
Subject: [PATCH 25/25] #101 fix nvidia compiler flags

---
 compiler_setup/nvidia.sh     | 2 +-
 compiler_setup/nvidia_acc.sh | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/compiler_setup/nvidia.sh b/compiler_setup/nvidia.sh
index c9abd3b2..61c4db93 100644
--- a/compiler_setup/nvidia.sh
+++ b/compiler_setup/nvidia.sh
@@ -26,7 +26,7 @@ OMPFLAGS="-mp"
 # Flag to use when compiling with OpenMP GPU offloading support
 OMPTARGETFLAGS="-mp=gpu -gpu=ccnative"
 # Flag to use to specify use of 'managed memory' (unified memory)
-UMEMFLAGS="-gpu=managed"
+UMEMFLAGS="-gpu=mem:managed"
 # Flags to use when compiling with OpenACC support
 ACCFLAGS="-acc=gpu -gpu=ccnative"
 
diff --git a/compiler_setup/nvidia_acc.sh b/compiler_setup/nvidia_acc.sh
index 0cb5a438..61c4f289 100644
--- a/compiler_setup/nvidia_acc.sh
+++ b/compiler_setup/nvidia_acc.sh
@@ -11,11 +11,11 @@ CFLAGS="-g"
 F90FLAGS="-O3 -Minfo=all"
 # Debugging options
 #F90FLAGS"+=" -fcheck=all -fbacktrace -ffpe-trap=invalid -g -O0"
-# V100 with managed memory
-F90FLAGS+=" -acc=gpu -gpu=cc70,mem:managed"
+# managed memory
+F90FLAGS+=" -acc=gpu -gpu=mem:managed"
 # Linker flags
-# V100 with managed memory
-LDFLAGS="-acc=gpu -gpu=cc70,mem:managed"
+# managed memory
+LDFLAGS="-acc=gpu -gpu=mem:managed"
 # Location of various CUDA maths libraries. nvtx3interop is required when
 # using nvtx for profiling.
 LDFLAGS+=" -cuda -L${CUDA_MATH_DIR}/lib64 -lnvtx3interop"