From f68a2c61dc31fa2b9b399ac628979ce0bfad24cc Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Tue, 21 Oct 2025 10:54:42 +0100 Subject: [PATCH 01/25] Update submodules --- shared/FortCL | 2 +- shared/PSyclone | 2 +- shared/dl_esm_inf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/shared/FortCL b/shared/FortCL index d516ed01..401148e4 160000 --- a/shared/FortCL +++ b/shared/FortCL @@ -1 +1 @@ -Subproject commit d516ed01ea23565bfc4f531a795d2c7a2a57fe50 +Subproject commit 401148e4b6d6efdd4d0157123b118ed07d831446 diff --git a/shared/PSyclone b/shared/PSyclone index 106543da..c9c20b1e 160000 --- a/shared/PSyclone +++ b/shared/PSyclone @@ -1 +1 @@ -Subproject commit 106543dafe26fe114de192f27311637a85a28a81 +Subproject commit c9c20b1ee96c10352b31463276408ad33ab84752 diff --git a/shared/dl_esm_inf b/shared/dl_esm_inf index ad209e9d..358402ec 160000 --- a/shared/dl_esm_inf +++ b/shared/dl_esm_inf @@ -1 +1 @@ -Subproject commit ad209e9d252995bd83127de4c481232ca14ed655 +Subproject commit 358402ecc4d88e93a62a3ca13dc9d20d2eb27f90 From 588d403c00debb2ae90b7ff6333ce0d7f0cf7558 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 09:30:35 +0100 Subject: [PATCH 02/25] Update PSyclone flags in tra_adv makefile --- .../nemo/tracer_advection/original/Makefile | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/original/Makefile b/benchmarks/nemo/tracer_advection/original/Makefile index 16d37045..d535c660 100644 --- a/benchmarks/nemo/tracer_advection/original/Makefile +++ b/benchmarks/nemo/tracer_advection/original/Makefile @@ -49,7 +49,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer DL_TIMER_NAME = libdl_timer_omp.a # Shorthand for invoking PSyclone. 
-PSYCLONE = psyclone -api nemo -l output ${PSYCLONE_PROFILE} +PSYCLONE = psyclone -l output ${PSYCLONE_PROFILE} # Add necessary flags for Nvidia nvtx instrumentation ifeq ($(ENABLE_NVIDIA_PROFILE),yes) @@ -71,14 +71,14 @@ tra_adv_serial: dl_timer ./tra_adv.F90 tra_adv_omp_cpu_levels: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/omp_cpu_levels_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/omp_cpu_levels_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${OMPFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_omp_cpu: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/omp_cpu_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/omp_cpu_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${OMPFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ @@ -87,42 +87,42 @@ tra_adv_omp_cpu: dl_timer ./tra_adv.F90 tra_adv_acc_kernels_unified_memory: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_kernels_explicit_data_movement: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_kernels_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} 
../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_loops_unified_memory: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_loops_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_loops_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_loops_explicit_data_movement: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_loops_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_loops_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_mixed_unified_memory: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_mixed_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_mixed_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_mixed_explicit_data_movement: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_mixed_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_mixed_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ @@ -132,7 +132,7 @@ ifndef UMEMFLAGS $(error The OMP offload target requires OpenMP unified memory but the UMEMFLAGS 
environment variable is not set) endif mkdir -p $@ - ${PSYCLONE} -s ../scripts/omp_gpu_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/omp_gpu_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \ FORT_FLAGS="${F90FLAGS} ${OMPTARGETFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ From 2a8c0d228a6af98a46aa67c27a10977f6cd0942c Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 09:32:21 +0100 Subject: [PATCH 03/25] Add new compiler-setup script for spack --- compiler_setup/spack_nvidia.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 compiler_setup/spack_nvidia.sh diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh new file mode 100644 index 00000000..11fc3096 --- /dev/null +++ b/compiler_setup/spack_nvidia.sh @@ -0,0 +1,11 @@ +# Build settings for the Nvidia compiler +# ================================================ +# Fortran compiler + +# ============================== +export F90=$FC +export PSYCLONE_NVIDIA_LIB_DIR=${HOME}/Projects/PSyclone/lib/profiling/nvidia +export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" +export OMPFLAGS="-mp" +export UMEMFLAGS="-gpu=managed" + From defe249de3f5bf84894597d868048bfb9f00f13e Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 11:00:14 +0100 Subject: [PATCH 04/25] Update omp cpu and gpu scripts --- .../tracer_advection/scripts/omp_cpu_trans.py | 30 +- .../tracer_advection/scripts/omp_gpu_trans.py | 22 +- .../nemo/tracer_advection/scripts/utils.py | 276 +++++++++++++----- 3 files changed, 219 insertions(+), 109 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py index c77813e7..ee7e85b4 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py @@ -1,7 +1,7 @@ # 
----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2022, Science and Technology Facilities Council. +# Copyright (c) 2022-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,38 +36,34 @@ ''' PSyclone transformation script to insert OpenMP Parallel Loop directives to the outermost loop that is parallelisable, including implicit loops.''' -from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import Node, Routine +from psyclone.transformations import OMPParallelTrans, OMPLoopTrans from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psyir: Node): ''' Add OpenMP Parallel Loop directive to all loops, including implicit ones to target CPU parallelism. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psyir: the PSyIR which this script will transform. 
''' - omp_parallel_trans = TransInfo().get_trans_name('OMPParallelTrans') - omp_loop_trans = TransInfo().get_trans_name('OMPLoopTrans') + omp_parallel_trans = OMPParallelTrans() + omp_loop_trans = OMPLoopTrans() - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines: found:") + for routine in psyir.walk(Routine): + print(routine.name) normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + convert_array_notation=True, hoist_expressions=False, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=omp_parallel_trans, loop_directive_trans=omp_loop_trans, collapse=False ) - - return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py index be2f4927..92663046 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py @@ -36,19 +36,17 @@ ''' PSyclone transformation script to insert OpenMP Target Loop directives to the outermost loop that is parallelisable, including implicit loops. ''' +from psyclone.psyir.nodes import Node, Routine from psyclone.psyir.transformations import OMPTargetTrans, OMPLoopTrans from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psy: Node): ''' Add OpenMP Target and Loop directives to all loops, including the implicit ones, to parallelise the code and execute it in an acceleration device. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psy: the PSyIR which this script will transform. 
''' omp_target_trans = OMPTargetTrans() @@ -56,21 +54,17 @@ def trans(psy): omp_loop_trans.omp_directive = "teamsdistributeparalleldo" omp_loop_trans.omp_schedule = "none" - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psy.walk(Routine): + print(routine.name) normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, - hoist_expressions=True, + routine, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=omp_target_trans, loop_directive_trans=omp_loop_trans, collapse=True ) - - return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py index 8c9f530a..eed22fb8 100644 --- a/benchmarks/nemo/tracer_advection/scripts/utils.py +++ b/benchmarks/nemo/tracer_advection/scripts/utils.py @@ -35,16 +35,30 @@ ''' Utilities file to parallelise Nemo code. ''' -from psyclone.domain.nemo.transformations import NemoAllArrayRange2LoopTrans -from psyclone.errors import InternalError -from psyclone.psyir.nodes import Loop, Assignment, Directive, CodeBlock, Call -from psyclone.psyir.transformations import HoistLoopBoundExprTrans, HoistTrans -from psyclone.transformations import TransformationError, ACCKernelsTrans +import os +from typing import List, Union + +from psyclone.psyir.nodes import ( + Assignment, Directive, CodeBlock, Call, IfBlock, IntrinsicCall, Loop, Node, + Reference, Return, Routine, Schedule, StructureReference) +from psyclone.psyir.symbols import DataSymbol +from psyclone.psyir.transformations import ( + ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, HoistLoopBoundExprTrans, + HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, ProfileTrans, + Reference2ArrayRangeTrans, ScalarisationTrans) +from psyclone.transformations import TransformationError + +# If routine names contain these substrings then we do not profile them +PROFILING_IGNORE = [] def normalise_loops( schedule, - 
unwrap_array_ranges: bool = True, + hoist_local_arrays: bool = True, + convert_array_notation: bool = True, + loopify_array_intrinsics: bool = True, + convert_range_loops: bool = True, + scalarise_loops: bool = False, hoist_expressions: bool = True, ): ''' Normalise all loops in the given schedule so that they are in an @@ -52,20 +66,77 @@ def normalise_loops( them. :param schedule: the PSyIR Schedule to transform. - :param unwrap_array_ranges: whether to convert ranges to explicit loops. - :param hoist_expressions: whether to hoist bounds and loop invariant \ + :type schedule: :py:class:`psyclone.psyir.nodes.node` + :param bool hoist_local_arrays: whether to hoist local arrays. + :param bool convert_array_notation: whether to convert array notation + to explicit loops. + :param bool loopify_array_intrinsics: whether to convert intrinsics that + operate on arrays to explicit loops (currently only maxval). + :param bool convert_range_loops: whether to convert ranges to explicit + loops. + :param scalarise_loops: whether to attempt to convert arrays to scalars + where possible, default is False. + :param hoist_expressions: whether to hoist bounds and loop invariant statements out of the loop nest. ''' - if unwrap_array_ranges: + if hoist_local_arrays: + # Apply the HoistLocalArraysTrans when possible, it cannot be applied + # to files with statement functions because it will attempt to put the + # allocate above it, which is not valid Fortran. 
+ try: + HoistLocalArraysTrans().apply(schedule) + except TransformationError: + pass + + if convert_array_notation: + # Make sure all array dimensions are explicit + for reference in schedule.walk(Reference): + part_of_the_call = reference.ancestor(Call) + if part_of_the_call: + if not part_of_the_call.is_elemental: + continue + if isinstance(reference.symbol, DataSymbol): + try: + Reference2ArrayRangeTrans().apply(reference) + except TransformationError: + pass + + if loopify_array_intrinsics: + for intr in schedule.walk(IntrinsicCall): + if intr.intrinsic.name == "MAXVAL": + try: + Maxval2LoopTrans().apply(intr) + except TransformationError as err: + print(err.value) + + if convert_range_loops: # Convert all array implicit loops to explicit loops - explicit_loops = NemoAllArrayRange2LoopTrans() + explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): - explicit_loops.apply(assignment) + if assignment.walk(StructureReference): + continue # TODO #2951 Fix issues with structure_refs + try: + explicit_loops.apply(assignment) + except TransformationError: + pass + + if scalarise_loops: + # Apply scalarisation to every loop. Execute this in reverse order + # as sometimes we can scalarise earlier loops if following loops + # have already been scalarised. 
+ loops = schedule.walk(Loop) + loops.reverse() + scalartrans = ScalarisationTrans() + for loop in loops: + scalartrans.apply(loop) if hoist_expressions: # First hoist all possible expressions for loop in schedule.walk(Loop): - HoistLoopBoundExprTrans().apply(loop) + try: + HoistLoopBoundExprTrans().apply(loop) + except TransformationError: + pass # Hoist all possible assignments (in reverse order so the inner loop # constants are hoisted all the way out if possible) @@ -81,109 +152,158 @@ def insert_explicit_loop_parallelism( schedule, region_directive_trans=None, loop_directive_trans=None, - collapse: bool = True + collapse: bool = True, + privatise_arrays: bool = False, + asynchronous_parallelism: bool = False, + uniform_intrinsics_only: bool = False, + enable_reductions: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive as an ancestor, attempt to insert the given region and loop directives. - :param region_directive_trans: PSyclone transformation to insert the \ + :param schedule: the PSyIR Schedule to transform. + :type schedule: :py:class:`psyclone.psyir.nodes.node` + :param region_directive_trans: PSyclone transformation that inserts the region directive. - :param loop_directive_trans: PSyclone transformation to use to insert the \ - loop directive. - :param collapse: whether to attempt to insert the collapse clause to as \ + :type region_directive_trans: \ + :py:class:`psyclone.transformation.Transformation` + :param loop_directive_trans: PSyclone transformation that inserts the + loop parallelisation directive. + :type loop_directive_trans: \ + :py:class:`psyclone.transformation.Transformation` + :param collapse: whether to attempt to insert the collapse clause to as many nested loops as possible. - ''' + :param privatise_arrays: whether to attempt to privatise arrays that cause + write-write race conditions. + :param asynchronous_parallelism: whether to attempt to add asynchronicity + to the parallel sections. 
+ :param uniform_intrinsics_only: if True it prevents offloading loops + with non-reproducible device intrinsics. + :param enable_reductions: whether to enable generation of reduction + clauses automatically. + ''' + nemo_v4 = os.environ.get('NEMOV4', False) + if schedule.name == "ts_wgt": + return # TODO #2937 WaW dependency incorrectly considered private # Add the parallel directives in each loop for loop in schedule.walk(Loop): if loop.ancestor(Directive): continue # Skip if an outer loop is already parallelised - try: - loop_directive_trans.apply(loop) - # Only add the region directive if the loop was successfully - # parallelised. - if region_directive_trans is not None: - region_directive_trans.apply(loop.parent.parent) - except TransformationError as err: - # This loop can not be transformed, proceed to next loop - print("Loop not parallelised because:", str(err)) - continue + opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, + "verbose": True, "nowait": asynchronous_parallelism, + "enable_reductions": enable_reductions} - if collapse: - # Count the number of perfectly nested loops - num_nested_loops = 0 - next_loop = loop - while isinstance(next_loop, Loop): - num_nested_loops += 1 - if len(next_loop.loop_body.children) > 1: - break - next_loop = next_loop.loop_body.children[0] + if uniform_intrinsics_only: + opts["device_string"] = "nvfortran-uniform" - if num_nested_loops > 1: - loop.parent.parent.collapse = num_nested_loops + routine_name = loop.ancestor(Routine).name + if ('dyn_spg' in routine_name and len(loop.walk(Loop)) > 2): + loop.append_preceding_comment( + "PSyclone: Loop not parallelised because it is in 'dyn_spg' " + "and is not the inner loop") + continue -def valid_kernel(node): - ''' - Whether the sub-tree that has `node` at its root is eligible to be - enclosed within an OpenACC KERNELS directive. 
+ try: + # First check that the region_directive is feasible for this region + if region_directive_trans: + # TODO psyclone/#3066 - validate *should* accept a single Node + # but currently has a bug and doesn't so we have to make a + # list and pass that. + region_directive_trans.validate([loop], options=opts) - :param node: the node in the PSyIR to check. - :type node: :py:class:`psyclone.psyir.nodes.Node` + # If it is, apply the parallelisation directive + loop_directive_trans.apply(loop, options=opts) - :returns: True if the sub-tree can be enclosed in a KERNELS region. - :rtype: bool + # And if successful, the region directive on top. + if region_directive_trans: + region_directive_trans.apply(loop.parent.parent, options=opts) + except TransformationError: + # This loop cannot be transformed, proceed to next loop. + # The parallelisation restrictions will be explained with a comment + # associated with the loop in the generated output. + continue - ''' - excluded_node_types = (CodeBlock, Call) - return node.walk(excluded_node_types) == [] + # If we are adding asynchronous parallelism then we now try to minimise + # the number of barriers. + if asynchronous_parallelism: + minsync_trans = OMPMinimiseSyncTrans() + minsync_trans.apply(schedule) -def add_kernels(children, default_present=True): +def add_profiling(children: Union[List[Node], Schedule]): ''' - Walks through the PSyIR inserting OpenACC KERNELS directives at as - high a level as possible. + Walks down the PSyIR and inserts the largest possible profiling regions + in place. Code inside functions or that contains directives is excluded. - :param children: list of sibling Nodes in PSyIR that are candidates for \ - inclusion in an ACC KERNELS region. - :type children: list of :py:class:`psyclone.psyir.nodes.Node` - :param bool default_present: whether or not to supply the \ - DEFAULT(PRESENT) clause to ACC KERNELS directives. 
+ :param children: a Schedule or sibling nodes in the PSyIR to which to + attempt to add profiling regions. ''' + if children and isinstance(children, Schedule): + # If we are given a Schedule, we look at its children. + children = children.children + if not children: return + # We do not want profiling calipers inside functions (such as the + # PSyclone-generated comparison functions). + parent_routine = children[0].ancestor(Routine) + if parent_routine and parent_routine.return_symbol: + return + node_list = [] for child in children[:]: - # Can this node be included in a kernels region? - if not valid_kernel(child): - try_kernels_trans(node_list, default_present) + # Do we want this node to be included in a profiling region? + if child.walk((Directive, Return)): + # It contains a directive or return statement so we put what we + # have so far inside a profiling region. + add_profile_region(node_list) + # A node that is not included in a profiling region marks the + # end of the current candidate region so reset the list. node_list = [] - # It can't so go down a level and try again - add_kernels(child.children) + # Now we go down a level and try again without attempting to put + # profiling below directives or within Assignments + if isinstance(child, IfBlock): + add_profiling(child.if_body) + add_profiling(child.else_body) + elif not isinstance(child, (Assignment, Directive)): + add_profiling(child.children) else: + # We can add this node to our list for the current region node_list.append(child) - try_kernels_trans(node_list, default_present) + add_profile_region(node_list) -def try_kernels_trans(nodes, default_present): +def add_profile_region(nodes): ''' - Attempt to enclose the supplied list of nodes within a kernels - region. If the transformation fails then the error message is - reported but execution continues. + Attempt to put the supplied list of nodes within a profiling region. - :param nodes: list of Nodes to enclose within a Kernels region. 
+ :param nodes: list of sibling PSyIR nodes to enclose. :type nodes: list of :py:class:`psyclone.psyir.nodes.Node` - :param bool default_present: whether or not to supply the \ - DEFAULT(PRESENT) clause to ACC KERNELS directives. ''' - if not nodes: - return - try: - ACCKernelsTrans().apply(nodes, {"default_present": default_present}) - except (TransformationError, InternalError) as err: - print(f"Failed to transform nodes: {nodes}") - print(f"Error was: {err}") + if nodes: + # Check whether we should be adding profiling inside this routine + routine_name = nodes[0].ancestor(Routine).name.lower() + if any(ignore in routine_name for ignore in PROFILING_IGNORE): + return + if len(nodes) == 1: + if isinstance(nodes[0], CodeBlock) and \ + len(nodes[0].get_ast_nodes) == 1: + # Don't create profiling regions for CodeBlocks consisting + # of a single statement + return + if isinstance(nodes[0], IfBlock) and \ + "was_single_stmt" in nodes[0].annotations and \ + isinstance(nodes[0].if_body[0], CodeBlock): + # We also don't put single statements consisting of + # 'IF(condition) CALL blah()' inside profiling regions + return + try: + ProfileTrans().apply(nodes) + except TransformationError: + pass From bf09178c9a2a7d6e2ee58227562068c0897b662d Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 11:13:09 +0100 Subject: [PATCH 05/25] Fix the omp_cpu_levels_trans script --- .../scripts/omp_cpu_levels_trans.py | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py index bc6b7ef9..4c8388b5 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2023, Science and Technology 
Facilities Council +# Copyright (c) 2018-2025, Science and Technology Facilities Council # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,40 +33,43 @@ # ----------------------------------------------------------------------------- # Authors: R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab -'''A simple transformation script for the introduction of OpenMP with PSyclone. +'''A very simple transformation script for the introduction of OpenMP + to certain loops using PSyclone. - >>> psyclone -api "nemo" -s ./omp_cpu_levels_trans.py tra_adv.F90 + >>> psyclone -s ./omp_cpu_levels_trans.py tra_adv.F90 This should produce a lot of output, ending with generated Fortran. ''' -from psyclone.psyGen import TransInfo -from psyclone.nemo import NemoKern +from psyclone.psyir.nodes import Loop, Node, Routine +from psyclone.transformations import OMPParallelLoopTrans, TransformationError -def trans(psy): +# Set up some loop_type inference rules in order to reference useful domain +# loop constructs by name +Loop.set_loop_type_inference_rules({ + "lon": {"variable": "ji"}, + "lat": {"variable": "jj"}, + "levels": {"variable": "jk"} +}) + + +def trans(psy: Node): ''' Transform a specific Schedule by making all loops over levels OpenMP parallel. - :param psy: the object holding all information on the PSy layer \ - to be modified. - :type psy: :py:class:`psyclone.psyGen.PSy` - - :returns: the transformed PSy object - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psy: the PSyIR to be modified. 
''' # Get the transformation we will apply - ompt = TransInfo().get_trans_name('OMPParallelLoopTrans') - for invoke in psy.invokes.invoke_list: - # Get the Schedule of the target routine - sched = invoke.schedule + ompt = OMPParallelLoopTrans() + for sched in psy.walk(Routine): # Apply the OMP transformation to each loop over levels containing # a kernel for loop in sched.loops(): - kernels = loop.walk(NemoKern) - if kernels and loop.loop_type == "levels": - ompt.apply(loop) - - # Return the modified psy object - return psy + if loop.loop_type == "levels": + try: + ompt.apply(loop) + except TransformationError as err: + loop.append_preceding_comment( + f"Loop cannot be parallelised because: {err}") From 4c6aeb887ae4a8895b629afc327707c60546e75b Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 11:43:46 +0100 Subject: [PATCH 06/25] Update acc kernels managed target --- .../acc_kernels_unified_memory_trans.py | 29 ++++---- .../nemo/tracer_advection/scripts/utils.py | 71 ++++++++++++++++++- 2 files changed, 85 insertions(+), 15 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py index fd9ad97b..93a84179 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2022, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,12 +34,12 @@ # Authors: R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab '''A transformation script that seeks to apply OpenACC KERNELS directives to -NEMO style code. 
In order to use it you must first install PSyclone. See +generic Fortran code. In order to use it you must first install PSyclone. See README.md in the top-level directory. Once you have psyclone installed, this may be used by doing: - $ psyclone -api nemo -s + $ psyclone -s The transformation script attempts to insert Kernels directives at the highest possible location(s) in the schedule tree (i.e. to enclose as @@ -47,24 +47,25 @@ ''' +from psyclone.psyir.nodes import Node, Routine + from utils import add_kernels -def trans(psy): +def trans(psy: Node): '''A PSyclone-script compliant transformation function. Applies - OpenACC 'kernels' to NEMO code. + OpenACC 'kernels' to existing code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' + :param psy: The PSyIR to apply transformations to. - print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + ''' + print("Routines found:") - for invoke in psy.invokes.invoke_list: + for routine in psy.walk(Routine): + print(routine.name) - if not invoke.schedule: - print(f"Invoke {invoke.name} has no Schedule! Skipping...") + if not routine.children: + print(f"Routine {routine.name} is empty! 
Skipping...") continue - add_kernels(invoke.schedule.children) + add_kernels(routine.children) diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py index eed22fb8..3d419b5d 100644 --- a/benchmarks/nemo/tracer_advection/scripts/utils.py +++ b/benchmarks/nemo/tracer_advection/scripts/utils.py @@ -38,12 +38,14 @@ import os from typing import List, Union +from psyclone.errors import InternalError from psyclone.psyir.nodes import ( Assignment, Directive, CodeBlock, Call, IfBlock, IntrinsicCall, Loop, Node, Reference, Return, Routine, Schedule, StructureReference) from psyclone.psyir.symbols import DataSymbol from psyclone.psyir.transformations import ( - ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, HoistLoopBoundExprTrans, + ACCKernelsTrans, ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, + HoistLoopBoundExprTrans, HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, ProfileTrans, Reference2ArrayRangeTrans, ScalarisationTrans) from psyclone.transformations import TransformationError @@ -307,3 +309,70 @@ def add_profile_region(nodes): ProfileTrans().apply(nodes) except TransformationError: pass + + +def valid_kernel(node): + ''' + Whether the sub-tree that has `node` at its root is eligible to be + enclosed within an OpenACC KERNELS directive. + + :param node: the node in the PSyIR to check. + :type node: :py:class:`psyclone.psyir.nodes.Node` + + :returns: True if the sub-tree can be enclosed in a KERNELS region. + :rtype: bool + + ''' + try: + ACCKernelsTrans().validate(node, {"disable_loop_check": True}) + except TransformationError: + return False + + return True + + +def add_kernels(children: list[Node], default_present: bool = True): + ''' + Walks through the PSyIR inserting OpenACC KERNELS directives at as + high a level as possible. + + :param children: list of sibling Nodes in PSyIR that are candidates for + inclusion in an ACC KERNELS region. 
+ :param default_present: whether or not to supply the + DEFAULT(PRESENT) clause to ACC KERNELS directives. + + ''' + if not children: + return + + node_list = [] + for child in children[:]: + # Can this node be included in a kernels region? + if not valid_kernel(child): + try_kernels_trans(node_list, default_present) + node_list = [] + # It can't so go down a level and try again + add_kernels(child.children) + else: + node_list.append(child) + try_kernels_trans(node_list, default_present) + + +def try_kernels_trans(nodes: list[Node], default_present: bool): + ''' + Attempt to enclose the supplied list of nodes within a kernels + region. If the transformation fails then the error message is + reported but execution continues. + + :param nodes: list of Nodes to enclose within a Kernels region. + :param default_present: whether or not to supply the + DEFAULT(PRESENT) clause to ACC KERNELS directives. + + ''' + if not nodes: + return + try: + ACCKernelsTrans().apply(nodes, {"default_present": default_present}) + except (TransformationError, InternalError) as err: + print(f"Failed to transform nodes: {nodes}") + print(f"Error was: {err}") From b4d3c69b3ff3c9a9b74d9a7323c875d7e984861a Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 11:53:28 +0100 Subject: [PATCH 07/25] Fix ACC kernels with explicit mem movement --- ...cc_kernels_explicit_data_movement_trans.py | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py index 6260568f..04f546b3 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# 
Copyright (c) 2018-2022, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,7 +39,7 @@ Once you have psyclone installed, this may be used by doing: - $ psyclone -api nemo -s + $ psyclone -s The transformation script attempts to insert Kernels directives at the highest possible location(s) in the schedule tree (i.e. to enclose as @@ -47,30 +47,26 @@ ''' -from psyclone.psyir.nodes import Directive +from psyclone.psyir.nodes import Directive, Routine, Node from psyclone.psyir.transformations import ACCUpdateTrans from psyclone.transformations import ACCEnterDataTrans from utils import add_kernels -def trans(psy): +def trans(psy: Node): '''A PSyclone-script compliant transformation function. Applies - OpenACC 'kernels' and 'data movement' directives to NEMO code. + OpenACC 'kernels' and 'data movement' directives to generic code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' - - print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + :param psy: The PSyIR to apply transformations to. - for invoke in psy.invokes.invoke_list: + ''' + for sched in psy.walk(Routine): - if not invoke.schedule: - print(f"Invoke {invoke.name} has no Schedule! Skipping...") + if not sched.children: + print(f"Routine {sched.name} is empty! 
Skipping...") continue - add_kernels(invoke.schedule.children) - if invoke.schedule.walk(Directive): - ACCEnterDataTrans().apply(invoke.schedule) - ACCUpdateTrans().apply(invoke.schedule) + add_kernels(sched.children) + if sched.walk(Directive): + ACCEnterDataTrans().apply(sched) + ACCUpdateTrans().apply(sched) From e29ff3637bb41886cd07a50711f27f0f5ec03c61 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 11:59:54 +0100 Subject: [PATCH 08/25] Fix acc loops with explicit mem --- .../acc_loops_explicit_data_movement_trans.py | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py index 4c5d1ebc..38aa257d 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2022-2023, Science and Technology Facilities Council. +# Copyright (c) 2022-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,47 +37,41 @@ to the outermost loop that is parallelisable, including implicit loops. 
This script also adds OpenACC explicit data movement directives.''' -from psyclone.psyir.nodes import Directive -from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import Directive, Node, Routine from psyclone.psyir.transformations import ACCUpdateTrans -from psyclone.transformations import ACCEnterDataTrans +from psyclone.transformations import ( + ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans) from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psy: Node): ''' Add OpenACC Parallel Loop directive to all loops, including implicit ones, to target GPU parallelism and explicit data movement directives. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psy: the PSyIR which this script will transform. ''' - acc_parallel_trans = TransInfo().get_trans_name('ACCParallelTrans') - acc_loop_trans = TransInfo().get_trans_name('ACCLoopTrans') + acc_parallel_trans = ACCParallelTrans() + acc_loop_trans = ACCLoopTrans() - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psy.walk(Routine): + print(routine.name) # Convert array and range notation to loops and hoist expressions normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + scalarise_loops=True, hoist_expressions=True, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=acc_parallel_trans, loop_directive_trans=acc_loop_trans, collapse=True ) - if invoke.schedule.walk(Directive): - ACCEnterDataTrans().apply(invoke.schedule) - ACCUpdateTrans().apply(invoke.schedule) - - return psy + if routine.walk(Directive): + ACCEnterDataTrans().apply(routine) + ACCUpdateTrans().apply(routine) From 1f3184876d955afd24ecae944d3c16a05956a286 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 
23 Oct 2025 12:04:49 +0100 Subject: [PATCH 09/25] Update acc-mixed with explicit mem --- .../acc_mixed_explicit_data_movement_trans.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py index db5dee0b..8d7b9661 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2023, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,7 +39,7 @@ Once you have psyclone installed, this may be used by doing: - $ psyclone -api nemo -s ./acc_mixed_explicit_data_movement_trans.py + $ psyclone -s ./acc_mixed_explicit_data_movement_trans.py This should produce a lot of output, ending with generated Fortran. Note that the Fortran source files provided to PSyclone must have already been @@ -47,50 +47,48 @@ ''' -from psyclone.psyir.nodes import Directive +from psyclone.psyir.nodes import Directive, Node, Routine from psyclone.psyir.transformations import ACCUpdateTrans from psyclone.transformations import ACCEnterDataTrans, ACCLoopTrans from utils import add_kernels, normalise_loops, \ insert_explicit_loop_parallelism -def trans(psy): +def trans(psy: Node): '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels', 'loop' and explicit 'data' directives to NEMO code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' + :param psy: The PSyIR to apply transformations to. 
- print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + ''' + print("Routines found:") + print("\n".join([rt.name for rt in psy.walk(Routine)])) - for invoke in psy.invokes.invoke_list: + for routine in psy.walk(Routine): - sched = invoke.schedule - if not sched: - print("Invoke {invoke.name} has no Schedule! Skipping...") + if not routine.children: + print("Routine {routine.name} is empty! Skipping...") continue # Convert array and range syntax to explicit loops normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + scalarise_loops=True, hoist_expressions=True, ) # Add OpenACC Loop directives insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=None, loop_directive_trans=ACCLoopTrans(), collapse=True ) # Add OpenACC Kernel directives - add_kernels(sched.children) + add_kernels(routine.children) # Add OpenACC data directives - if invoke.schedule.walk(Directive): - ACCEnterDataTrans().apply(invoke.schedule) - ACCUpdateTrans().apply(invoke.schedule) + if routine.walk(Directive): + ACCEnterDataTrans().apply(routine) + ACCUpdateTrans().apply(routine) From 401ece2e82b43dde3d5f84be7f2d56fb787f8ea1 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 12:13:53 +0100 Subject: [PATCH 10/25] Modernise compiler flags and fix acc-mixed-umem target --- .../scripts/acc_mixed_unified_memory_trans.py | 29 +++++++++---------- compiler_setup/spack_nvidia.sh | 2 +- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py index 3cc7b6c0..3ed50735 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause 
License # -# Copyright (c) 2018-2023, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,43 +47,42 @@ ''' +from psyclone.psyir.nodes import Node, Routine from psyclone.transformations import ACCLoopTrans from utils import add_kernels, normalise_loops, \ insert_explicit_loop_parallelism -def trans(psy): +def trans(psy: Node): '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels' and 'loop' directives to NEMO code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' + :param psy: The PSyIR to apply transformations to. - print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + ''' + print("Routines found:") + print("\n".join([rt.name for rt in psy.walk(Routine)])) - for invoke in psy.invokes.invoke_list: + for routine in psy.walk(Routine): - sched = invoke.schedule - if not sched: - print("Invoke {invoke.name} has no Schedule! Skipping...") + if not routine.children: + print("Invoke {routine.name} is empty! 
Skipping...") continue # Convert array and range syntax to explicit loops normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + scalarise_loops=True, hoist_expressions=True, ) # Add OpenACC Loop directives insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=None, loop_directive_trans=ACCLoopTrans(), collapse=True ) # Add OpenACC Kernel directives - add_kernels(sched.children) + add_kernels(routine) diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh index 11fc3096..491e9a83 100644 --- a/compiler_setup/spack_nvidia.sh +++ b/compiler_setup/spack_nvidia.sh @@ -7,5 +7,5 @@ export F90=$FC export PSYCLONE_NVIDIA_LIB_DIR=${HOME}/Projects/PSyclone/lib/profiling/nvidia export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" export OMPFLAGS="-mp" -export UMEMFLAGS="-gpu=managed" +export UMEMFLAGS="-gpu=mem:managed" From 9191008504c74220069316622e9f79c743881130 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 12:19:51 +0100 Subject: [PATCH 11/25] Fix acc-loops-um --- .../scripts/acc_loops_unified_memory_trans.py | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py index 1557dd54..4ccf0ca6 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2022-2023, Science and Technology Facilities Council. +# Copyright (c) 2022-2025, Science and Technology Facilities Council. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -36,38 +36,35 @@ ''' PSyclone transformation script to insert OpenACC Parallel Loop directives to the outermost loop that is parallelisable, including implicit loops.''' -from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import Node, Routine +from psyclone.transformations import ACCParallelTrans, ACCLoopTrans + from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psy: Node): ''' Add OpenACC Parallel Loop directive to all loops, including implicit ones to target GPU parallelism. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psy: the PSyIR which this script will transform. ''' - acc_parallel_trans = TransInfo().get_trans_name('ACCParallelTrans') - acc_loop_trans = TransInfo().get_trans_name('ACCLoopTrans') + acc_parallel_trans = ACCParallelTrans() + acc_loop_trans = ACCLoopTrans() - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psy.walk(Routine): + print(routine.name) normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, hoist_expressions=True, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=acc_parallel_trans, loop_directive_trans=acc_loop_trans, collapse=True ) - return psy From 6c5eb135339325895d2524d1c7fc639bf4fb4075 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 13:00:20 +0100 Subject: [PATCH 12/25] Rm path to profiling lib from spack-setup script --- compiler_setup/spack_nvidia.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh index 491e9a83..5db0c629 100644 --- a/compiler_setup/spack_nvidia.sh +++ b/compiler_setup/spack_nvidia.sh @@ 
-4,8 +4,9 @@ # ============================== export F90=$FC -export PSYCLONE_NVIDIA_LIB_DIR=${HOME}/Projects/PSyclone/lib/profiling/nvidia + export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" export OMPFLAGS="-mp" export UMEMFLAGS="-gpu=mem:managed" +export ACCFLAGS="-acc=gpu -gpu=ccnative" From 87d307a1eb2c21d31134195af4893daa3a1e2bf5 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 13:03:57 +0100 Subject: [PATCH 13/25] Fix Makefile for tra-adv compute_in_subroutine --- .../tracer_advection/compute_in_subroutine/Makefile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile index 90a55d02..a8d3a901 100644 --- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile +++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile @@ -22,7 +22,7 @@ DL_TIMER_NAME = libdl_timer_omp.a # Shorthand for invoking PSyclone with line-length limiting applied # to the output Fortran. -PSYCLONE = psyclone -api nemo -l output +PSYCLONE = psyclone -l output # Serial version. tra_adv_serial: dl_timer @@ -45,7 +45,7 @@ tra_adv_no_auto_serial: dl_timer # OpenACC version using Unified Memory with timer around outer loop only. tra_adv_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. 
@@ -58,7 +58,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \ + ${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -68,7 +68,7 @@ endif # Serial Fortran version after transformation to SIR-compliant form. tra_adv_sir: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \ + ${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \ ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -78,7 +78,7 @@ tra_adv_sir: dl_timer # OpenACC added after transformation to SIR-compliant form. tra_adv_sir_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -90,7 +90,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_sir_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -opsy \ + ${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. 
@@ -110,3 +110,4 @@ allclean: clean rm -rf tra_adv_acc_prof rm -rf tra_adv_sir rm -rf tra_adv_sir_acc + rm -rf tra_adv_no_auto_serial From d0d6c06aa79ae93d2ece2cf7c78efcf20843aef2 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 13:12:24 +0100 Subject: [PATCH 14/25] Fix other versions of tra_adv benchmark --- .../tracer_advection/compute_in_subroutine/Makefile | 4 ++-- benchmarks/nemo/tracer_advection/multi_kernel/Makefile | 10 +++++----- compiler_setup/spack_nvidia.sh | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile index a8d3a901..593b6207 100644 --- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile +++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile @@ -58,7 +58,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -o \ + ${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -90,7 +90,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_sir_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -o \ + ${PSYCLONE} --profile routines -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. 
diff --git a/benchmarks/nemo/tracer_advection/multi_kernel/Makefile b/benchmarks/nemo/tracer_advection/multi_kernel/Makefile index 83b3d966..ab8b09bc 100644 --- a/benchmarks/nemo/tracer_advection/multi_kernel/Makefile +++ b/benchmarks/nemo/tracer_advection/multi_kernel/Makefile @@ -22,7 +22,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer DL_TIMER_NAME = libdl_timer_omp.a # Shorthand for invoking PSyclone. -PSYCLONE = psyclone -api nemo -l output +PSYCLONE = psyclone -l output # Serial version. tra_adv_serial: dl_timer @@ -36,7 +36,7 @@ tra_adv_serial: dl_timer # OpenACC version with timer around outer loop only. tra_adv_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -49,7 +49,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/kernels_trans.py -opsy \ + ${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -59,7 +59,7 @@ endif # Serial Fortran version after transformation to SIR-compliant form. tra_adv_sir: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \ + ${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \ ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -69,7 +69,7 @@ tra_adv_sir: dl_timer # OpenACC added after transformation to SIR-compliant form. tra_adv_sir_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. 
diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh index 5db0c629..6eb25ec0 100644 --- a/compiler_setup/spack_nvidia.sh +++ b/compiler_setup/spack_nvidia.sh @@ -5,6 +5,7 @@ # ============================== export F90=$FC +export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvToolsExt" export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" export OMPFLAGS="-mp" export UMEMFLAGS="-gpu=mem:managed" From 9c5526c42d0194f7e7649ac22ea77c68d679a708 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 13:16:38 +0100 Subject: [PATCH 15/25] Update GHA workflow file --- .github/workflows/makefile-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/makefile-test.yml b/.github/workflows/makefile-test.yml index 245a0a78..c6185ab8 100644 --- a/.github/workflows/makefile-test.yml +++ b/.github/workflows/makefile-test.yml @@ -47,12 +47,12 @@ on: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 - run: python -m pip install --upgrade pip - run: cd shared/PSyclone && pip install . 
- name: Install dependencies From ba634ea14e4367559f523f7e046f765de12db090 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 13:22:45 +0100 Subject: [PATCH 16/25] Update kokkos submodule --- shared/kokkos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/kokkos b/shared/kokkos index ae5fc649..aecc5dc8 160000 --- a/shared/kokkos +++ b/shared/kokkos @@ -1 +1 @@ -Subproject commit ae5fc649ef4b62b48a01123759ed066bff227b43 +Subproject commit aecc5dc8f5be7df3a4d8b9c6fa99f1212475bccc From b856e783156470eec108894ba8ade1747984548a Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 16:04:06 +0100 Subject: [PATCH 17/25] Fix NEMOLite2D acc version --- .../fortran/boundary_conditions_mod.f90 | 1 - .../psykal/psyclone_scripts/acc_transform.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 b/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 index ca47f880..ca10aa6a 100644 --- a/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 +++ b/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 @@ -5,7 +5,6 @@ module boundary_conditions_mod GO_STENCIL use kernel_mod, only: kernel_type, GO_POINTWISE, GO_DOFS, & GO_ALL_PTS, GO_INTERNAL_PTS - use physical_params_mod use grid_mod use field_mod implicit none diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py index 7725c55f..e453a9ea 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py @@ -3,7 +3,10 @@ from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyGen import TransInfo -from psyclone.psyir.nodes import Loop +from psyclone.psyir.nodes import Loop, Routine +from 
psyclone.transformations import ( + ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans, ACCRoutineTrans, + KernelImportsToArguments) def trans(psy): @@ -12,19 +15,23 @@ def trans(psy): tinfo = TransInfo() parallel_trans = tinfo.get_trans_name('ACCParallelTrans') loop_trans = tinfo.get_trans_name('ACCLoopTrans') - enter_data_trans = tinfo.get_trans_name('ACCEnterDataTrans') - routine_trans = tinfo.get_trans_name('ACCRoutineTrans') - glo2arg_trans = tinfo.get_trans_name('KernelImportsToArguments') + enter_data_trans = ACCEnterDataTrans() + routine_trans = ACCRoutineTrans() + glo2arg_trans = KernelImportsToArguments() inline_trans = KernelModuleInlineTrans() - invoke = psy.invokes.get('invoke_0') - schedule = invoke.schedule + schedule = psy.walk(Routine)[0] # Apply the OpenACC Loop transformation to *every* loop # in the schedule for child in schedule.children: if isinstance(child, Loop): - loop_trans.apply(child, {"collapse": 2}) + # We need to ignore dependencies on 'va' because PSyclone correctly + # spots that there is a dependence in one of the boundary-condition + # kernels. However, we know that practically this isn't a problem + # because of the way the domain (mask) is configured. 
+ loop_trans.apply(child, {"collapse": 2, + "ignore_dependencies_for": ["va"]}) # Put all of the loops in a single parallel region parallel_trans.apply(schedule) From 004f7e5f51ceee25f18ed6d0e3c9113b0d28fb23 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 16:17:09 +0100 Subject: [PATCH 18/25] Update all NEMOLite2D transformation scripts --- .../psykal/psyclone_scripts/acc_transform.py | 8 +++---- .../psykal/psyclone_scripts/ocl_transform.py | 16 ++++++++------ .../psyclone_scripts/omp_task_transform.py | 22 ++++++++++++------- .../psykal/psyclone_scripts/omp_transform.py | 11 +++++++--- .../psyclone_scripts/serial_transform.py | 9 +++++--- 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py index e453a9ea..52ff8c7c 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py @@ -18,7 +18,7 @@ def trans(psy): enter_data_trans = ACCEnterDataTrans() routine_trans = ACCRoutineTrans() glo2arg_trans = KernelImportsToArguments() - inline_trans = KernelModuleInlineTrans() + mod_inline_trans = KernelModuleInlineTrans() schedule = psy.walk(Routine)[0] @@ -27,8 +27,8 @@ def trans(psy): for child in schedule.children: if isinstance(child, Loop): # We need to ignore dependencies on 'va' because PSyclone correctly - # spots that there is a dependence in one of the boundary-condition - # kernels. However, we know that practically this isn't a problem + # spots that there is a dependence in the bc_flather_v kernel. + # However, we know that practically this isn't a problem # because of the way the domain (mask) is configured. 
loop_trans.apply(child, {"collapse": 2, "ignore_dependencies_for": ["va"]}) @@ -44,6 +44,6 @@ def trans(psy): for kern in schedule.coded_kernels(): glo2arg_trans.apply(kern) routine_trans.apply(kern) - inline_trans.apply(kern) + mod_inline_trans.apply(kern) return psy diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py index 8235d3a0..6221871d 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py @@ -3,10 +3,13 @@ that PSyclone will generate an OpenCL PSy layer. ''' import os -from psyclone.psyGen import TransInfo -from psyclone.domain.gocean.transformations import \ - GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans + +from psyclone.domain.gocean.transformations import ( + GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans) from psyclone.configuration import Config +from psyclone.psyir.nodes import Routine +from psyclone.transformations import ( + KernelImportsToArguments) # Global variables to configure the PSyclone OpenCL generation: @@ -33,13 +36,12 @@ def trans(psy): ''' Transform the schedule for OpenCL generation ''' # Import transformations - tinfo = TransInfo() - globaltrans = tinfo.get_trans_name('KernelImportsToArguments') + globaltrans = KernelImportsToArguments() move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans() cltrans = GOOpenCLTrans() - # Get the invoke routine - schedule = psy.invokes.get('invoke_0').schedule + # Get the routine + schedule = psy.walk(Routine)[0] # Map the kernels by their name to different OpenCL queues. 
The multiple # command queues can be executed concurrently while each command queue diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py index 17be3e07..4b40ad59 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py @@ -2,20 +2,21 @@ function via the -s option. It applies OpenMP tasking to every loop and inlines all kernels in the schedule.''' -from psyclone.psyir.nodes import Loop +from psyclone.psyir.nodes import Loop, Routine from psyclone.configuration import Config -from psyclone.transformations import OMPParallelTrans, OMPSingleTrans, \ - OMPTaskloopTrans, KernelModuleInlineTrans -from psyclone.psyir.transformations import OMPTaskwaitTrans -from psyclone.psyir.nodes import OMPTaskloopDirective, OMPTaskwaitDirective, \ - OMPDirective, OMPParallelDirective +from psyclone.domain.common.transformations import KernelModuleInlineTrans +from psyclone.transformations import ( + OMPParallelTrans, OMPSingleTrans) +from psyclone.psyir.transformations import OMPTaskloopTrans, OMPTaskwaitTrans +from psyclone.psyir.nodes import (OMPTaskloopDirective, OMPTaskwaitDirective, + OMPDirective, OMPParallelDirective) def trans(psy): '''Transformation entry point''' config = Config.get() - schedule = psy.invokes.get('invoke_0').schedule + schedule = psy.walk(Routine)[0] loop_trans = OMPTaskloopTrans(grainsize=32, nogroup=True) wait_trans = OMPTaskwaitTrans() @@ -28,7 +29,12 @@ def trans(psy): for child in schedule.children: if isinstance(child, Loop): - loop_trans.apply(child) + # We need to ignore dependencies on 'va' because PSyclone correctly + # spots that there is a dependence in the bc_flather_v kernel. + # However, we know that practically this isn't a problem + # because of the way the domain (mask) is configured. 
+ loop_trans.apply(child, + options={"ignore_dependencies_for": ["va"]}) single_trans = OMPSingleTrans() parallel_trans = OMPParallelTrans() diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py index 573cef38..6a50a357 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py @@ -5,7 +5,7 @@ from psyclone.configuration import Config from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyGen import TransInfo -from psyclone.psyir.nodes import Loop +from psyclone.psyir.nodes import Loop, Routine def trans(psy): @@ -17,7 +17,7 @@ def trans(psy): parallel_trans = tinfo.get_trans_name('OMPParallelTrans') module_inline_trans = KernelModuleInlineTrans() - schedule = psy.invokes.get('invoke_0').schedule + schedule = psy.walk(Routine)[0] # Inline all kernels in this Schedule for kernel in schedule.kernels(): @@ -30,7 +30,12 @@ def trans(psy): if isinstance(child, Loop): parallel_loop_trans.apply(child) else: - loop_trans.apply(child) + # We need to ignore dependencies on 'va' because PSyclone correctly + # spots that there is a dependence in the bc_flather_v kernel. + # However, we know that practically this isn't a problem + # because of the way the domain (mask) is configured. + loop_trans.apply(child, + options={"ignore_dependencies_for": ["va"]}) if not config.distributed_memory: # If it is not distributed memory, enclose all of these loops diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py index 1456cc2d..544e93ae 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py @@ -2,14 +2,17 @@ via the -s option. 
This script module-inline all kernels in the PSy-layer.''' from psyclone.domain.common.transformations import KernelModuleInlineTrans +from psyclone.psyir.nodes import Node, Routine -def trans(psy): - ''' Transformation script entry function ''' +def trans(psy: Node): + '''Entry point for PSyIR transformation. This script module-inlines + every user-supplied kernel that is called. + ''' itrans = KernelModuleInlineTrans() - schedule = psy.invokes.get('invoke_0').schedule + schedule = psy.walk(Routine)[0] # Module-Inline all coded kernels in this Schedule for kernel in schedule.coded_kernels(): From fa9747aefd6605b71876cb0b7aaaf318af1376fc Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Thu, 23 Oct 2025 16:31:40 +0100 Subject: [PATCH 19/25] Rm unused and ancient script from Shallow --- benchmarks/shallow/SEQ/runme_loop_fuse.py | 25 ----------------------- 1 file changed, 25 deletions(-) delete mode 100644 benchmarks/shallow/SEQ/runme_loop_fuse.py diff --git a/benchmarks/shallow/SEQ/runme_loop_fuse.py b/benchmarks/shallow/SEQ/runme_loop_fuse.py deleted file mode 100644 index 0265c266..00000000 --- a/benchmarks/shallow/SEQ/runme_loop_fuse.py +++ /dev/null @@ -1,25 +0,0 @@ -from parse import parse,ParseError -from psyGen import PSyFactory,GenerationError -#from algGen import Alg -api="gocean" -filename="shallow_gocean.f90" -ast,invokeInfo=parse(filename,api=api,invoke_name="invoke") -psy=PSyFactory(api).create(invokeInfo) -print psy.gen -#alg=Alg(ast,psy) - -print psy.invokes.names -schedule=psy.invokes.get('invoke_0').schedule -schedule.view() - -from psyGen import TransInfo -t=TransInfo() -print t.list -#lf=t.get_trans_name('DoubleLoopFuse') -lf=t.get_trans_name('LoopFuse') - -newschedule,memento=lf.apply(schedule.children[0],schedule.children[1]) -#newschedule,memento=lf.apply(schedule.children[0].children[0].children[0],schedule.children[1].children[0].children[0]) -newschedule.view() -#psy.invokes.get('invoke_0')._schedule=newschedule -#print psy.gen From 
20a6db24b9c40d1c8c17649b20fe8f4ec17c63d0 Mon Sep 17 00:00:00 2001 From: Sergi Siso Date: Tue, 9 Dec 2025 10:13:02 +0000 Subject: [PATCH 20/25] Update kokkos --- .../nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile | 5 ++++- shared/kokkos | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile index 9be53b00..65725e62 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile @@ -16,6 +16,9 @@ KOKKOS_PATH ?= $(SHARED_DIR)/kokkos KOKKOS_DEBUG ?= no # Careful, 10x performance penalty in kernels. CXXFLAGS = $(CFLAGS) # Use same CFLAGS to compile Kokkos library. +# The Kokkos Makefile is deprecated, but we can still use it with: +KOKKOS_USE_DEPRECATED_MAKEFILES=1 + # If no KOKKOS_DEVICES is specified, by default use the OpenMP KOKKOS_DEVICES ?= OpenMP @@ -90,7 +93,7 @@ clean: ${MAKE} -C ${INF_DIR} clean rm -f *.o *.mod *.MOD *~ *.dat rm -f gnu_opt_report.txt *.optrpt - rm -rf KokkosCore_* Makefile.kokkos.f90 + rm -rf KokkosCore_* Makefile.kokkos.f90 desul Desul_Config.tmp allclean: clean rm -f *.exe fparser.log *.a diff --git a/shared/kokkos b/shared/kokkos index aecc5dc8..552f2375 160000 --- a/shared/kokkos +++ b/shared/kokkos @@ -1 +1 @@ -Subproject commit aecc5dc8f5be7df3a4d8b9c6fa99f1212475bccc +Subproject commit 552f2375de06361f8a5662abc0859ae233b5d8f8 From bc336ee2ea2e6bae00454618220008de0de98575 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Mon, 15 Dec 2025 11:07:57 +0000 Subject: [PATCH 21/25] #101 update PSyclone to master --- shared/PSyclone | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shared/PSyclone b/shared/PSyclone index c9c20b1e..63d4c225 160000 --- a/shared/PSyclone +++ b/shared/PSyclone @@ -1 +1 @@ -Subproject commit c9c20b1ee96c10352b31463276408ad33ab84752 +Subproject commit 
63d4c22552fb6cd5fafbd4185ef373a1d9e3713c From cb4039630bf87b42e8aaa7d22291428cdb30ac47 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Mon, 15 Dec 2025 11:18:58 +0000 Subject: [PATCH 22/25] #101 update problem-size script and compiler flags --- .../tracer_advection/scripts/problemsize.sh | 5 +++- compiler_setup/intel.sh | 6 ++--- compiler_setup/nvidia_acc.sh | 24 ++++--------------- compiler_setup/spack_nvidia.sh | 2 +- 4 files changed, 11 insertions(+), 26 deletions(-) diff --git a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh index 09441c3e..f7b7d985 100755 --- a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh +++ b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh @@ -2,6 +2,8 @@ # Bash script to execute the tracer-advection benchmark with increasing # domain sizes. +# By default the process is pinned to core 0. Please edit the taskset +# command below if you wish to change this. if [ "$#" -lt 1 ] || [ ! -x "$1" ]; then echo "Wrong arguments. Usage: ../../problemsize.sh ./executable" @@ -24,7 +26,8 @@ for power in $(seq 4 9); do export JPI=${size} export JPJ=${size} - time=$(taskset -c 2 $@ | awk '{if ($1 == "Time-stepping") {print $5} }') + # Execute - use taskset to pin the process to a core. + time=$(taskset -c 0 $@ | awk '{if ($1 == "Time-stepping") {print $5} }') echo $size $time done diff --git a/compiler_setup/intel.sh b/compiler_setup/intel.sh index dd3a70dd..ce334af0 100644 --- a/compiler_setup/intel.sh +++ b/compiler_setup/intel.sh @@ -41,10 +41,8 @@ OMPFLAGS="-qopenmp" LDFLAGS= #LDFLAGS+= -fast -# The archiver used to generate the API library. We must -# use Intel's xiar if doing IPO as otherwise the library -# doesn't contain the necessary symbols. -AR=xiar +# The archiver used to generate the API library. 
+AR=ar ARFLAGS=cru export F90 diff --git a/compiler_setup/nvidia_acc.sh b/compiler_setup/nvidia_acc.sh index 415e73a8..0cb5a438 100644 --- a/compiler_setup/nvidia_acc.sh +++ b/compiler_setup/nvidia_acc.sh @@ -11,30 +11,14 @@ CFLAGS="-g" F90FLAGS="-O3 -Minfo=all" # Debugging options #F90FLAGS"+=" -fcheck=all -fbacktrace -ffpe-trap=invalid -g -O0" -# -Mcuda is for CUDA Fortran -# nordc - do not link to routines compiled for device (ensure -# kernel code is in-lined in loops) -# cc = compute capability -# Registers are shared by threads in an SMP. The more registers a kernel -# uses, the fewer threads it can support. This parameter can be tuned and -# should be a multiple of 8. -# -Mcuda is required to build CUDA Fortran -# For Quadro K600 -#F90FLAGS+=" -acc -ta=tesla:cc30,nordc -Mcuda=cc30,nordc" -# For Tesla K20c -#F90FLAGS+=" -acc -ta=tesla,cc35,maxregcount:80,nordc -Mcuda=cc35,maxregcount:80,nordc" # V100 with managed memory -F90FLAGS+=" -acc=gpu -gpu=cc70,managed" +F90FLAGS+=" -acc=gpu -gpu=cc70,mem:managed" # Linker flags -# For Quadro K600 -#LDFLAGS+=" -acc -ta=tesla,cc30 -Mcuda=cc30,nordc" -# For Tesla K20c -#LDFLAGS="-acc -ta=nvidia,cc35 -Mcuda=cc35,nordc" # V100 with managed memory -LDFLAGS="-acc=gpu -gpu=cc70,managed" -# Location of various CUDA maths libraries. libnvToolsExt is required when +LDFLAGS="-acc=gpu -gpu=cc70,mem:managed" +# Location of various CUDA maths libraries. nvtx3interop is required when # using nvtx for profiling. 
-LDFLAGS+=" -Mcuda -L${CUDA_MATH_DIR}/lib64 -lnvToolsExt" +LDFLAGS+=" -cuda -L${CUDA_MATH_DIR}/lib64 -lnvtx3interop" # Flags to use when compiling with OpenMP support OMPFLAGS="-mp" # Command to use to create archive of object files diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh index 6eb25ec0..a3ac0450 100644 --- a/compiler_setup/spack_nvidia.sh +++ b/compiler_setup/spack_nvidia.sh @@ -5,7 +5,7 @@ # ============================== export F90=$FC -export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvToolsExt" +export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvtx3interop" export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" export OMPFLAGS="-mp" export UMEMFLAGS="-gpu=mem:managed" From 432dc4c9d585da8cfa8dc8266210042d6890da44 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Mon, 15 Dec 2025 11:46:12 +0000 Subject: [PATCH 23/25] #101 update psyclone scripts to handle u- and v-flather kernels --- .../psykal/psyclone_scripts/acc_transform.py | 16 ++++++++++------ .../psyclone_scripts/omp_task_transform.py | 12 ++++++++---- .../psykal/psyclone_scripts/omp_transform.py | 19 ++++++++++++------- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py index 52ff8c7c..4d858485 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py @@ -26,12 +26,16 @@ def trans(psy): # in the schedule for child in schedule.children: if isinstance(child, Loop): - # We need to ignore dependencies on 'va' because PSyclone correctly - # spots that there is a dependence in the bc_flather_v kernel. - # However, we know that practically this isn't a problem - # because of the way the domain (mask) is configured. 
- loop_trans.apply(child, {"collapse": 2, - "ignore_dependencies_for": ["va"]}) + opts = {"collapse": 2} + if child.kernels()[0].name == "bc_flather_v_code": + # We need to ignore dependencies on 'va' because PSyclone + # spots that there is a dependence in the bc_flather_v kernel. + # However, we know that practically this isn't a problem + # because of the way the domain (mask) is configured. + opts["ignore_dependencies_for"] = ["va%data"] + if child.kernels()[0].name == "bc_flather_u_code": + opts["ignore_dependencies_for"] = ["ua%data"] + loop_trans.apply(child, options=opts) # Put all of the loops in a single parallel region parallel_trans.apply(schedule) diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py index 4b40ad59..179ed4d0 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py @@ -29,12 +29,16 @@ def trans(psy): for child in schedule.children: if isinstance(child, Loop): - # We need to ignore dependencies on 'va' because PSyclone correctly - # spots that there is a dependence in the bc_flather_v kernel. + # We need to ignore dependencies on '{u,v}a' because PSyclone + # spots that there is a dependence in the bc_flather_{u,v} kernel. # However, we know that practically this isn't a problem # because of the way the domain (mask) is configured. 
- loop_trans.apply(child, - options={"ignore_dependencies_for": ["va"]}) + options = {} + if child.kernels()[0].name == "bc_flather_v_code": + options["ignore_dependencies_for"] = ["va%data"] + if child.kernels()[0].name == "bc_flather_u_code": + options["ignore_dependencies_for"] = ["ua%data"] + loop_trans.apply(child, options=options) single_trans = OMPSingleTrans() parallel_trans = OMPParallelTrans() diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py index 6a50a357..d150587b 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py @@ -26,16 +26,21 @@ def trans(psy): # Apply the OpenMPLoop transformation to every child in the schedule or # OpenMPParallelLoop to every Loop if it has distributed memory. for child in schedule.children: + # We need to ignore dependencies on '{u,v}a' because PSyclone correctly + # spots that there is a dependence in the bc_flather_{u,v} kernel. + # However, we know that practically this isn't a problem + # because these boundary-condition kernels only update values + # outside the domain. + options = {} + if child.kernels()[0].name == "bc_flather_v_code": + options["ignore_dependencies_for"] = ["va%data"] + if child.kernels()[0].name == "bc_flather_u_code": + options["ignore_dependencies_for"] = ["ua%data"] if config.distributed_memory: if isinstance(child, Loop): - parallel_loop_trans.apply(child) + parallel_loop_trans.apply(child, options=options) else: - # We need to ignore dependencies on 'va' because PSyclone correctly - # spots that there is a dependence in the bc_flather_v kernel. - # However, we know that practically this isn't a problem - # because of the way the domain (mask) is configured. 
- loop_trans.apply(child, - options={"ignore_dependencies_for": ["va"]}) + loop_trans.apply(child, options=options) if not config.distributed_memory: # If it is not distributed memory, enclose all of these loops From badb92d649c4d195dc15c058bce8ce9ecb51da36 Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Mon, 15 Dec 2025 12:05:59 +0000 Subject: [PATCH 24/25] #101 tidy tra-adv scripts --- .../psykal/psyclone_scripts/acc_transform.py | 12 +-- .../psyclone_scripts/omp_task_transform.py | 6 +- .../psykal/psyclone_scripts/omp_transform.py | 13 +-- ...cc_kernels_explicit_data_movement_trans.py | 6 +- .../acc_kernels_unified_memory_trans.py | 6 +- .../acc_loops_explicit_data_movement_trans.py | 6 +- .../scripts/acc_loops_unified_memory_trans.py | 6 +- .../acc_mixed_explicit_data_movement_trans.py | 8 +- .../scripts/acc_mixed_unified_memory_trans.py | 8 +- .../scripts/omp_cpu_levels_trans.py | 6 +- .../tracer_advection/scripts/omp_cpu_trans.py | 2 +- .../tracer_advection/scripts/omp_gpu_trans.py | 6 +- .../nemo/tracer_advection/scripts/utils.py | 92 +------------------ 13 files changed, 44 insertions(+), 133 deletions(-) diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py index 4d858485..3d1273b7 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py @@ -3,15 +3,15 @@ from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyGen import TransInfo -from psyclone.psyir.nodes import Loop, Routine +from psyclone.psyir.nodes import Container, Loop, Routine from psyclone.transformations import ( ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans, ACCRoutineTrans, KernelImportsToArguments) -def trans(psy): - ''' Take the supplied psy object, apply OpenACC transformations - to the schedule of invoke_0 and return the new psy object ''' +def 
trans(psyir: Container) -> None: + ''' Take the supplied psyir object, apply OpenACC transformations + to the schedule of invoke_0. ''' tinfo = TransInfo() parallel_trans = tinfo.get_trans_name('ACCParallelTrans') loop_trans = tinfo.get_trans_name('ACCLoopTrans') @@ -20,7 +20,7 @@ def trans(psy): glo2arg_trans = KernelImportsToArguments() mod_inline_trans = KernelModuleInlineTrans() - schedule = psy.walk(Routine)[0] + schedule = psyir.walk(Routine)[0] # Apply the OpenACC Loop transformation to *every* loop # in the schedule @@ -49,5 +49,3 @@ def trans(psy): glo2arg_trans.apply(kern) routine_trans.apply(kern) mod_inline_trans.apply(kern) - - return psy diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py index 179ed4d0..b10e07ff 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py @@ -2,7 +2,7 @@ function via the -s option. 
It applies OpenMP tasking to every loop and inlines all kernels in the schedule.''' -from psyclone.psyir.nodes import Loop, Routine +from psyclone.psyir.nodes import Container, Loop, Routine from psyclone.configuration import Config from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.transformations import ( @@ -12,11 +12,11 @@ OMPDirective, OMPParallelDirective) -def trans(psy): +def trans(psyir: Container) -> None: '''Transformation entry point''' config = Config.get() - schedule = psy.walk(Routine)[0] + schedule = psyir.walk(Routine)[0] loop_trans = OMPTaskloopTrans(grainsize=32, nogroup=True) wait_trans = OMPTaskwaitTrans() diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py index d150587b..ff085752 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py @@ -5,11 +5,14 @@ from psyclone.configuration import Config from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyGen import TransInfo -from psyclone.psyir.nodes import Loop, Routine +from psyclone.psyir.nodes import Container, Loop, Routine -def trans(psy): - ''' Transformation entry point ''' +def trans(psyir: Container) -> None: + ''' + Transformation entry point. 
+ + ''' config = Config.get() tinfo = TransInfo() parallel_loop_trans = tinfo.get_trans_name('GOceanOMPParallelLoopTrans') @@ -17,7 +20,7 @@ def trans(psy): parallel_trans = tinfo.get_trans_name('OMPParallelTrans') module_inline_trans = KernelModuleInlineTrans() - schedule = psy.walk(Routine)[0] + schedule = psyir.walk(Routine)[0] # Inline all kernels in this Schedule for kernel in schedule.kernels(): @@ -46,5 +49,3 @@ def trans(psy): # If it is not distributed memory, enclose all of these loops # within a single OpenMP PARALLEL region parallel_trans.apply(schedule.children) - - return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py index 04f546b3..a8375404 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py @@ -53,14 +53,14 @@ from utils import add_kernels -def trans(psy: Node): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels' and 'data movement' directives to generic code. - :param psy: The PSyIR to apply transformations to. + :param psyir: The PSyIR to apply transformations to. ''' - for sched in psy.walk(Routine): + for sched in psyir.walk(Routine): if not sched.children: print(f"Routine {sched.name} is empty! 
Skipping...") diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py index 93a84179..904ffef9 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py @@ -52,16 +52,16 @@ from utils import add_kernels -def trans(psy: Node): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels' to existing code. - :param psy: The PSyIR to apply transformations to. + :param psyir: The PSyIR to apply transformations to. ''' print("Routines found:") - for routine in psy.walk(Routine): + for routine in psyir.walk(Routine): print(routine.name) if not routine.children: diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py index 38aa257d..ac42bce2 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py @@ -44,18 +44,18 @@ from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy: Node): +def trans(psyir: Node) -> None: ''' Add OpenACC Parallel Loop directive to all loops, including implicit ones, to target GPU parallelism and explicit data movement directives. - :param psy: the PSyIR which this script will transform. + :param psyir: the PSyIR which this script will transform. 
''' acc_parallel_trans = ACCParallelTrans() acc_loop_trans = ACCLoopTrans() print("Routines found:") - for routine in psy.walk(Routine): + for routine in psyir.walk(Routine): print(routine.name) # Convert array and range notation to loops and hoist expressions diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py index 4ccf0ca6..efef8816 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py @@ -42,18 +42,18 @@ from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy: Node): +def trans(psyir: Node) -> None: ''' Add OpenACC Parallel Loop directive to all loops, including implicit ones to target GPU parallelism. - :param psy: the PSyIR which this script will transform. + :param psyir: the PSyIR which this script will transform. ''' acc_parallel_trans = ACCParallelTrans() acc_loop_trans = ACCLoopTrans() print("Routines found:") - for routine in psy.walk(Routine): + for routine in psyir.walk(Routine): print(routine.name) normalise_loops( diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py index 8d7b9661..b8f63f50 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py @@ -54,17 +54,17 @@ insert_explicit_loop_parallelism -def trans(psy: Node): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels', 'loop' and explicit 'data' directives to NEMO code. - :param psy: The PSyIR to apply transformations to. + :param psyir: The PSyIR to apply transformations to. 
''' print("Routines found:") - print("\n".join([rt.name for rt in psy.walk(Routine)])) + print("\n".join([rt.name for rt in psyir.walk(Routine)])) - for routine in psy.walk(Routine): + for routine in psyir.walk(Routine): if not routine.children: print("Routine {routine.name} is empty! Skipping...") diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py index 3ed50735..c3cd28cb 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py @@ -53,17 +53,17 @@ insert_explicit_loop_parallelism -def trans(psy: Node): +def trans(psyir: Node) -> Node: '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels' and 'loop' directives to NEMO code. - :param psy: The PSyIR to apply transformations to. + :param psyir: The PSyIR to apply transformations to. ''' print("Routines found:") - print("\n".join([rt.name for rt in psy.walk(Routine)])) + print("\n".join([rt.name for rt in psyir.walk(Routine)])) - for routine in psy.walk(Routine): + for routine in psyir.walk(Routine): if not routine.children: print("Invoke {routine.name} is empty! Skipping...") diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py index 4c8388b5..c2e9d9fb 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py @@ -54,16 +54,16 @@ }) -def trans(psy: Node): +def trans(psyir: Node) -> None: ''' Transform a specific Schedule by making all loops over levels OpenMP parallel. - :param psy: the PSyIR to be modified. + :param psyir: the PSyIR to be modified. 
''' # Get the transformation we will apply ompt = OMPParallelLoopTrans() - for sched in psy.walk(Routine): + for sched in psyir.walk(Routine): # Apply the OMP transformation to each loop over levels containing # a kernel for loop in sched.loops(): diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py index ee7e85b4..1860eead 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py @@ -41,7 +41,7 @@ from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psyir: Node): +def trans(psyir: Node) -> None: ''' Add OpenMP Parallel Loop directive to all loops, including implicit ones to target CPU parallelism. diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py index 92663046..6612a9f5 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py @@ -41,12 +41,12 @@ from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy: Node): +def trans(psyir: Node) -> None: ''' Add OpenMP Target and Loop directives to all loops, including the implicit ones, to parallelise the code and execute it in an acceleration device. - :param psy: the PSyIR which this script will transform. + :param psyir: the PSyIR which this script will transform. 
''' omp_target_trans = OMPTargetTrans() @@ -55,7 +55,7 @@ def trans(psy: Node): omp_loop_trans.omp_schedule = "none" print("Routines found:") - for routine in psy.walk(Routine): + for routine in psyir.walk(Routine): print(routine.name) normalise_loops( diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py index 3d419b5d..db02f2be 100644 --- a/benchmarks/nemo/tracer_advection/scripts/utils.py +++ b/benchmarks/nemo/tracer_advection/scripts/utils.py @@ -46,13 +46,10 @@ from psyclone.psyir.transformations import ( ACCKernelsTrans, ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, HoistLoopBoundExprTrans, - HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, ProfileTrans, + HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, Reference2ArrayRangeTrans, ScalarisationTrans) from psyclone.transformations import TransformationError -# If routine names contain these substrings then we do not profile them -PROFILING_IGNORE = [] - def normalise_loops( schedule, @@ -186,8 +183,7 @@ def insert_explicit_loop_parallelism( ''' nemo_v4 = os.environ.get('NEMOV4', False) - if schedule.name == "ts_wgt": - return # TODO #2937 WaW dependency incorrectly considered private + # Add the parallel directives in each loop for loop in schedule.walk(Loop): if loop.ancestor(Directive): @@ -200,14 +196,6 @@ def insert_explicit_loop_parallelism( if uniform_intrinsics_only: opts["device_string"] = "nvfortran-uniform" - routine_name = loop.ancestor(Routine).name - - if ('dyn_spg' in routine_name and len(loop.walk(Loop)) > 2): - loop.append_preceding_comment( - "PSyclone: Loop not parallelised because it is in 'dyn_spg' " - "and is not the inner loop") - continue - try: # First check that the region_directive is feasible for this region if region_directive_trans: @@ -235,82 +223,6 @@ def insert_explicit_loop_parallelism( minsync_trans.apply(schedule) -def add_profiling(children: Union[List[Node], Schedule]): - ''' - Walks down the PSyIR and 
inserts the largest possible profiling regions - in place. Code inside functions or that contains directives is excluded. - - :param children: a Schedule or sibling nodes in the PSyIR to which to - attempt to add profiling regions. - - ''' - if children and isinstance(children, Schedule): - # If we are given a Schedule, we look at its children. - children = children.children - - if not children: - return - - # We do not want profiling calipers inside functions (such as the - # PSyclone-generated comparison functions). - parent_routine = children[0].ancestor(Routine) - if parent_routine and parent_routine.return_symbol: - return - - node_list = [] - for child in children[:]: - # Do we want this node to be included in a profiling region? - if child.walk((Directive, Return)): - # It contains a directive or return statement so we put what we - # have so far inside a profiling region. - add_profile_region(node_list) - # A node that is not included in a profiling region marks the - # end of the current candidate region so reset the list. - node_list = [] - # Now we go down a level and try again without attempting to put - # profiling below directives or within Assignments - if isinstance(child, IfBlock): - add_profiling(child.if_body) - add_profiling(child.else_body) - elif not isinstance(child, (Assignment, Directive)): - add_profiling(child.children) - else: - # We can add this node to our list for the current region - node_list.append(child) - add_profile_region(node_list) - - -def add_profile_region(nodes): - ''' - Attempt to put the supplied list of nodes within a profiling region. - - :param nodes: list of sibling PSyIR nodes to enclose. 
- :type nodes: list of :py:class:`psyclone.psyir.nodes.Node` - - ''' - if nodes: - # Check whether we should be adding profiling inside this routine - routine_name = nodes[0].ancestor(Routine).name.lower() - if any(ignore in routine_name for ignore in PROFILING_IGNORE): - return - if len(nodes) == 1: - if isinstance(nodes[0], CodeBlock) and \ - len(nodes[0].get_ast_nodes) == 1: - # Don't create profiling regions for CodeBlocks consisting - # of a single statement - return - if isinstance(nodes[0], IfBlock) and \ - "was_single_stmt" in nodes[0].annotations and \ - isinstance(nodes[0].if_body[0], CodeBlock): - # We also don't put single statements consisting of - # 'IF(condition) CALL blah()' inside profiling regions - return - try: - ProfileTrans().apply(nodes) - except TransformationError: - pass - - def valid_kernel(node): ''' Whether the sub-tree that has `node` at its root is eligible to be From 04084426f809babba379dd36bea779825b81c6fe Mon Sep 17 00:00:00 2001 From: Andrew Porter Date: Mon, 15 Dec 2025 14:02:08 +0000 Subject: [PATCH 25/25] #101 fix nvidia compiler flags --- compiler_setup/nvidia.sh | 2 +- compiler_setup/nvidia_acc.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/compiler_setup/nvidia.sh b/compiler_setup/nvidia.sh index c9abd3b2..61c4db93 100644 --- a/compiler_setup/nvidia.sh +++ b/compiler_setup/nvidia.sh @@ -26,7 +26,7 @@ OMPFLAGS="-mp" # Flag to use when compiling with OpenMP GPU offloading support OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" # Flag to use to specify use of 'managed memory' (unified memory) -UMEMFLAGS="-gpu=managed" +UMEMFLAGS="-gpu=mem:managed" # Flags to use when compiling with OpenACC support ACCFLAGS="-acc=gpu -gpu=ccnative" diff --git a/compiler_setup/nvidia_acc.sh b/compiler_setup/nvidia_acc.sh index 0cb5a438..61c4f289 100644 --- a/compiler_setup/nvidia_acc.sh +++ b/compiler_setup/nvidia_acc.sh @@ -11,11 +11,11 @@ CFLAGS="-g" F90FLAGS="-O3 -Minfo=all" # Debugging options #F90FLAGS"+=" -fcheck=all 
-fbacktrace -ffpe-trap=invalid -g -O0" -# V100 with managed memory -F90FLAGS+=" -acc=gpu -gpu=cc70,mem:managed" +# managed memory +F90FLAGS+=" -acc=gpu -gpu=mem:managed" # Linker flags -# V100 with managed memory -LDFLAGS="-acc=gpu -gpu=cc70,mem:managed" +# managed memory +LDFLAGS="-acc=gpu -gpu=mem:managed" # Location of various CUDA maths libraries. nvtx3interop is required when # using nvtx for profiling. LDFLAGS+=" -cuda -L${CUDA_MATH_DIR}/lib64 -lnvtx3interop"