diff --git a/.github/workflows/makefile-test.yml b/.github/workflows/makefile-test.yml index 245a0a78..c6185ab8 100644 --- a/.github/workflows/makefile-test.yml +++ b/.github/workflows/makefile-test.yml @@ -47,12 +47,12 @@ on: jobs: build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 - run: python -m pip install --upgrade pip - run: cd shared/PSyclone && pip install . - name: Install dependencies diff --git a/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 b/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 index ca47f880..ca10aa6a 100644 --- a/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 +++ b/benchmarks/nemo/nemolite2d/kernels/fortran/boundary_conditions_mod.f90 @@ -5,7 +5,6 @@ module boundary_conditions_mod GO_STENCIL use kernel_mod, only: kernel_type, GO_POINTWISE, GO_DOFS, & GO_ALL_PTS, GO_INTERNAL_PTS - use physical_params_mod use grid_mod use field_mod implicit none diff --git a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile index 9be53b00..65725e62 100644 --- a/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile +++ b/benchmarks/nemo/nemolite2d/manual_versions/psykal_kokkos/Makefile @@ -16,6 +16,9 @@ KOKKOS_PATH ?= $(SHARED_DIR)/kokkos KOKKOS_DEBUG ?= no # Careful, 10x performance penalty in kernels. CXXFLAGS = $(CFLAGS) # Use same CFLAGS to compile Kokkos library. 
+# The Kokkos Makefile is deprecated, but we can still use it with: +KOKKOS_USE_DEPRECATED_MAKEFILES=1 + # If no KOKKOS_DEVICES is specified, by default use the OpenMP KOKKOS_DEVICES ?= OpenMP @@ -90,7 +93,7 @@ clean: ${MAKE} -C ${INF_DIR} clean rm -f *.o *.mod *.MOD *~ *.dat rm -f gnu_opt_report.txt *.optrpt - rm -rf KokkosCore_* Makefile.kokkos.f90 + rm -rf KokkosCore_* Makefile.kokkos.f90 desul Desul_Config.tmp allclean: clean rm -f *.exe fparser.log *.a diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py index 7725c55f..3d1273b7 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/acc_transform.py @@ -3,28 +3,39 @@ from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyGen import TransInfo -from psyclone.psyir.nodes import Loop +from psyclone.psyir.nodes import Container, Loop, Routine +from psyclone.transformations import ( + ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans, ACCRoutineTrans, + KernelImportsToArguments) -def trans(psy): - ''' Take the supplied psy object, apply OpenACC transformations - to the schedule of invoke_0 and return the new psy object ''' +def trans(psyir: Container) -> None: + ''' Take the supplied psyir object, apply OpenACC transformations + to the schedule of invoke_0. 
''' tinfo = TransInfo() parallel_trans = tinfo.get_trans_name('ACCParallelTrans') loop_trans = tinfo.get_trans_name('ACCLoopTrans') - enter_data_trans = tinfo.get_trans_name('ACCEnterDataTrans') - routine_trans = tinfo.get_trans_name('ACCRoutineTrans') - glo2arg_trans = tinfo.get_trans_name('KernelImportsToArguments') - inline_trans = KernelModuleInlineTrans() + enter_data_trans = ACCEnterDataTrans() + routine_trans = ACCRoutineTrans() + glo2arg_trans = KernelImportsToArguments() + mod_inline_trans = KernelModuleInlineTrans() - invoke = psy.invokes.get('invoke_0') - schedule = invoke.schedule + schedule = psyir.walk(Routine)[0] # Apply the OpenACC Loop transformation to *every* loop # in the schedule for child in schedule.children: if isinstance(child, Loop): - loop_trans.apply(child, {"collapse": 2}) + opts = {"collapse": 2} + if child.kernels()[0].name == "bc_flather_v_code": + # We need to ignore dependencies on 'va' because PSyclone + # spots that there is a dependence in the bc_flather_v kernel. + # However, we know that practically this isn't a problem + # because of the way the domain (mask) is configured. + opts["ignore_dependencies_for"] = ["va%data"] + if child.kernels()[0].name == "bc_flather_u_code": + opts["ignore_dependencies_for"] = ["ua%data"] + loop_trans.apply(child, options=opts) # Put all of the loops in a single parallel region parallel_trans.apply(schedule) @@ -37,6 +48,4 @@ def trans(psy): for kern in schedule.coded_kernels(): glo2arg_trans.apply(kern) routine_trans.apply(kern) - inline_trans.apply(kern) - - return psy + mod_inline_trans.apply(kern) diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py index 8235d3a0..6221871d 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/ocl_transform.py @@ -3,10 +3,13 @@ that PSyclone will generate an OpenCL PSy layer. 
''' import os -from psyclone.psyGen import TransInfo -from psyclone.domain.gocean.transformations import \ - GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans + +from psyclone.domain.gocean.transformations import ( + GOMoveIterationBoundariesInsideKernelTrans, GOOpenCLTrans) from psyclone.configuration import Config +from psyclone.psyir.nodes import Routine +from psyclone.transformations import ( + KernelImportsToArguments) # Global variables to configure the PSyclone OpenCL generation: @@ -33,13 +36,12 @@ def trans(psy): ''' Transform the schedule for OpenCL generation ''' # Import transformations - tinfo = TransInfo() - globaltrans = tinfo.get_trans_name('KernelImportsToArguments') + globaltrans = KernelImportsToArguments() move_boundaries_trans = GOMoveIterationBoundariesInsideKernelTrans() cltrans = GOOpenCLTrans() - # Get the invoke routine - schedule = psy.invokes.get('invoke_0').schedule + # Get the routine + schedule = psy.walk(Routine)[0] # Map the kernels by their name to different OpenCL queues. The multiple # command queues can be executed concurrently while each command queue diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py index 17be3e07..b10e07ff 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_task_transform.py @@ -2,20 +2,21 @@ function via the -s option. 
It applies OpenMP tasking to every loop and inlines all kernels in the schedule.''' -from psyclone.psyir.nodes import Loop +from psyclone.psyir.nodes import Container, Loop, Routine from psyclone.configuration import Config -from psyclone.transformations import OMPParallelTrans, OMPSingleTrans, \ - OMPTaskloopTrans, KernelModuleInlineTrans -from psyclone.psyir.transformations import OMPTaskwaitTrans -from psyclone.psyir.nodes import OMPTaskloopDirective, OMPTaskwaitDirective, \ - OMPDirective, OMPParallelDirective +from psyclone.domain.common.transformations import KernelModuleInlineTrans +from psyclone.transformations import ( + OMPParallelTrans, OMPSingleTrans) +from psyclone.psyir.transformations import OMPTaskloopTrans, OMPTaskwaitTrans +from psyclone.psyir.nodes import (OMPTaskloopDirective, OMPTaskwaitDirective, + OMPDirective, OMPParallelDirective) -def trans(psy): +def trans(psyir: Container) -> None: '''Transformation entry point''' config = Config.get() - schedule = psy.invokes.get('invoke_0').schedule + schedule = psyir.walk(Routine)[0] loop_trans = OMPTaskloopTrans(grainsize=32, nogroup=True) wait_trans = OMPTaskwaitTrans() @@ -28,7 +29,16 @@ def trans(psy): for child in schedule.children: if isinstance(child, Loop): - loop_trans.apply(child) + # We need to ignore dependencies on '{u,v}a' because PSyclone + # spots that there is a dependence in the bc_flather_{u,v} kernel. + # However, we know that practically this isn't a problem + # because of the way the domain (mask) is configured. 
+ options = {} + if child.kernels()[0].name == "bc_flather_v_code": + options["ignore_dependencies_for"] = ["va%data"] + if child.kernels()[0].name == "bc_flather_u_code": + options["ignore_dependencies_for"] = ["ua%data"] + loop_trans.apply(child, options=options) single_trans = OMPSingleTrans() parallel_trans = OMPParallelTrans() diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py index 573cef38..ff085752 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/omp_transform.py @@ -5,11 +5,14 @@ from psyclone.configuration import Config from psyclone.domain.common.transformations import KernelModuleInlineTrans from psyclone.psyGen import TransInfo -from psyclone.psyir.nodes import Loop +from psyclone.psyir.nodes import Container, Loop, Routine -def trans(psy): - ''' Transformation entry point ''' +def trans(psyir: Container) -> None: + ''' + Transformation entry point. + + ''' config = Config.get() tinfo = TransInfo() parallel_loop_trans = tinfo.get_trans_name('GOceanOMPParallelLoopTrans') @@ -17,7 +20,7 @@ def trans(psy): parallel_trans = tinfo.get_trans_name('OMPParallelTrans') module_inline_trans = KernelModuleInlineTrans() - schedule = psy.invokes.get('invoke_0').schedule + schedule = psyir.walk(Routine)[0] # Inline all kernels in this Schedule for kernel in schedule.kernels(): @@ -26,15 +29,23 @@ def trans(psy): # Apply the OpenMPLoop transformation to every child in the schedule or # OpenMPParallelLoop to every Loop if it has distributed memory. for child in schedule.children: + # We need to ignore dependencies on '{u,v}a' because PSyclone correctly + # spots that there is a dependence in the bc_flather_{u,v} kernel. + # However, we know that practically this isn't a problem + # because these boundary-condition kernels only update values + # outside the domain. 
+ options = {} + if child.kernels()[0].name == "bc_flather_v_code": + options["ignore_dependencies_for"] = ["va%data"] + if child.kernels()[0].name == "bc_flather_u_code": + options["ignore_dependencies_for"] = ["ua%data"] if config.distributed_memory: if isinstance(child, Loop): - parallel_loop_trans.apply(child) + parallel_loop_trans.apply(child, options=options) else: - loop_trans.apply(child) + loop_trans.apply(child, options=options) if not config.distributed_memory: # If it is not distributed memory, enclose all of these loops # within a single OpenMP PARALLEL region parallel_trans.apply(schedule.children) - - return psy diff --git a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py index 1456cc2d..544e93ae 100644 --- a/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py +++ b/benchmarks/nemo/nemolite2d/psykal/psyclone_scripts/serial_transform.py @@ -2,14 +2,17 @@ via the -s option. This script module-inline all kernels in the PSy-layer.''' from psyclone.domain.common.transformations import KernelModuleInlineTrans +from psyclone.psyir.nodes import Node, Routine -def trans(psy): - ''' Transformation script entry function ''' +def trans(psy: Node): + '''Entry point for PSyIR transformation. This script module-inlines + every user-supplied kernel that is called. 
+ ''' itrans = KernelModuleInlineTrans() - schedule = psy.invokes.get('invoke_0').schedule + schedule = psy.walk(Routine)[0] # Module-Inline all coded kernels in this Schedule for kernel in schedule.coded_kernels(): diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile index 90a55d02..593b6207 100644 --- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile +++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile @@ -22,7 +22,7 @@ DL_TIMER_NAME = libdl_timer_omp.a # Shorthand for invoking PSyclone with line-length limiting applied # to the output Fortran. -PSYCLONE = psyclone -api nemo -l output +PSYCLONE = psyclone -l output # Serial version. tra_adv_serial: dl_timer @@ -45,7 +45,7 @@ tra_adv_no_auto_serial: dl_timer # OpenACC version using Unified Memory with timer around outer loop only. tra_adv_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -58,7 +58,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \ + ${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -68,7 +68,7 @@ endif # Serial Fortran version after transformation to SIR-compliant form. tra_adv_sir: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \ + ${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \ ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. 
@@ -78,7 +78,7 @@ tra_adv_sir: dl_timer # OpenACC added after transformation to SIR-compliant form. tra_adv_sir_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -90,7 +90,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_sir_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -opsy \ + ${PSYCLONE} --profile routines -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -110,3 +110,4 @@ allclean: clean rm -rf tra_adv_acc_prof rm -rf tra_adv_sir rm -rf tra_adv_sir_acc + rm -rf tra_adv_no_auto_serial diff --git a/benchmarks/nemo/tracer_advection/multi_kernel/Makefile b/benchmarks/nemo/tracer_advection/multi_kernel/Makefile index 83b3d966..ab8b09bc 100644 --- a/benchmarks/nemo/tracer_advection/multi_kernel/Makefile +++ b/benchmarks/nemo/tracer_advection/multi_kernel/Makefile @@ -22,7 +22,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer DL_TIMER_NAME = libdl_timer_omp.a # Shorthand for invoking PSyclone. -PSYCLONE = psyclone -api nemo -l output +PSYCLONE = psyclone -l output # Serial version. tra_adv_serial: dl_timer @@ -36,7 +36,7 @@ tra_adv_serial: dl_timer # OpenACC version with timer around outer loop only. tra_adv_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. 
@@ -49,7 +49,7 @@ ifndef PSYCLONE_NVIDIA_LIB_DIR $(error The tra_adv_acc_prof must have the PSYCLONE_NVIDIA_LIB_DIR defined) endif mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/kernels_trans.py -opsy \ + ${PSYCLONE} --profile routines -s ../scripts/acc_kernels_unified_memory_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -59,7 +59,7 @@ endif # Serial Fortran version after transformation to SIR-compliant form. tra_adv_sir: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \ + ${PSYCLONE} -s ../scripts/sir_trans.py -o $@/tra_adv_compute.f90 \ ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -69,7 +69,7 @@ tra_adv_sir: dl_timer # OpenACC added after transformation to SIR-compliant form. tra_adv_sir_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -o \ $@/tra_adv_compute.f90 ./tra_adv_compute.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. diff --git a/benchmarks/nemo/tracer_advection/original/Makefile b/benchmarks/nemo/tracer_advection/original/Makefile index 16d37045..d535c660 100644 --- a/benchmarks/nemo/tracer_advection/original/Makefile +++ b/benchmarks/nemo/tracer_advection/original/Makefile @@ -49,7 +49,7 @@ DL_TIMER_DIR = ../../../../shared/dl_timer DL_TIMER_NAME = libdl_timer_omp.a # Shorthand for invoking PSyclone. 
-PSYCLONE = psyclone -api nemo -l output ${PSYCLONE_PROFILE} +PSYCLONE = psyclone -l output ${PSYCLONE_PROFILE} # Add necessary flags for Nvidia nvtx instrumentation ifeq ($(ENABLE_NVIDIA_PROFILE),yes) @@ -71,14 +71,14 @@ tra_adv_serial: dl_timer ./tra_adv.F90 tra_adv_omp_cpu_levels: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/omp_cpu_levels_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/omp_cpu_levels_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${OMPFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_omp_cpu: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/omp_cpu_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/omp_cpu_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${OMPFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ @@ -87,42 +87,42 @@ tra_adv_omp_cpu: dl_timer ./tra_adv.F90 tra_adv_acc_kernels_unified_memory: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_kernels_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_kernels_explicit_data_movement: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_kernels_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_kernels_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} 
../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_loops_unified_memory: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_loops_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_loops_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_loops_explicit_data_movement: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_loops_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_loops_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_mixed_unified_memory: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_mixed_unified_memory_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_mixed_unified_memory_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ tra_adv_acc_mixed_explicit_data_movement: dl_timer ./tra_adv.F90 mkdir -p $@ - ${PSYCLONE} -s ../scripts/acc_mixed_explicit_data_movement_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/acc_mixed_explicit_data_movement_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ @@ -132,7 +132,7 @@ ifndef UMEMFLAGS $(error The OMP offload target requires OpenMP unified memory but the UMEMFLAGS 
environment variable is not set) endif mkdir -p $@ - ${PSYCLONE} -s ../scripts/omp_gpu_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + ${PSYCLONE} -s ../scripts/omp_gpu_trans.py -o $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \ FORT_FLAGS="${F90FLAGS} ${OMPTARGETFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py index 6260568f..a8375404 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_data_movement_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2022, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,7 +39,7 @@ Once you have psyclone installed, this may be used by doing: - $ psyclone -api nemo -s + $ psyclone -s The transformation script attempts to insert Kernels directives at the highest possible location(s) in the schedule tree (i.e. to enclose as @@ -47,30 +47,26 @@ ''' -from psyclone.psyir.nodes import Directive +from psyclone.psyir.nodes import Directive, Routine, Node from psyclone.psyir.transformations import ACCUpdateTrans from psyclone.transformations import ACCEnterDataTrans from utils import add_kernels -def trans(psy): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies - OpenACC 'kernels' and 'data movement' directives to NEMO code. + OpenACC 'kernels' and 'data movement' directives to generic code. - :param psy: The PSy layer object to apply transformations to. 
- :type psy: :py:class:`psyclone.psyGen.PSy` - ''' - - print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + :param psyir: The PSyIR to apply transformations to. - for invoke in psy.invokes.invoke_list: + ''' + for sched in psyir.walk(Routine): - if not invoke.schedule: - print(f"Invoke {invoke.name} has no Schedule! Skipping...") + if not sched.children: + print(f"Routine {sched.name} is empty! Skipping...") continue - add_kernels(invoke.schedule.children) - if invoke.schedule.walk(Directive): - ACCEnterDataTrans().apply(invoke.schedule) - ACCUpdateTrans().apply(invoke.schedule) + add_kernels(sched.children) + if sched.walk(Directive): + ACCEnterDataTrans().apply(sched) + ACCUpdateTrans().apply(sched) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py index fd9ad97b..904ffef9 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_unified_memory_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2022, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,12 +34,12 @@ # Authors: R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab '''A transformation script that seeks to apply OpenACC KERNELS directives to -NEMO style code. In order to use it you must first install PSyclone. See +generic Fortran code. In order to use it you must first install PSyclone. See README.md in the top-level directory. 
Once you have psyclone installed, this may be used by doing: - $ psyclone -api nemo -s + $ psyclone -s The transformation script attempts to insert Kernels directives at the highest possible location(s) in the schedule tree (i.e. to enclose as @@ -47,24 +47,25 @@ ''' +from psyclone.psyir.nodes import Node, Routine + from utils import add_kernels -def trans(psy): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies - OpenACC 'kernels' to NEMO code. + OpenACC 'kernels' to existing code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' + :param psyir: The PSyIR to apply transformations to. - print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + ''' + print("Routines found:") - for invoke in psy.invokes.invoke_list: + for routine in psyir.walk(Routine): + print(routine.name) - if not invoke.schedule: - print(f"Invoke {invoke.name} has no Schedule! Skipping...") + if not routine.children: + print(f"Routine {routine.name} is empty! Skipping...") continue - add_kernels(invoke.schedule.children) + add_kernels(routine.children) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py index 4c5d1ebc..ac42bce2 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_explicit_data_movement_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2022-2023, Science and Technology Facilities Council. +# Copyright (c) 2022-2025, Science and Technology Facilities Council. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -37,47 +37,41 @@ to the outermost loop that is parallelisable, including implicit loops. This script also adds OpenACC explicit data movement directives.''' -from psyclone.psyir.nodes import Directive -from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import Directive, Node, Routine from psyclone.psyir.transformations import ACCUpdateTrans -from psyclone.transformations import ACCEnterDataTrans +from psyclone.transformations import ( + ACCEnterDataTrans, ACCLoopTrans, ACCParallelTrans) from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psyir: Node) -> None: ''' Add OpenACC Parallel Loop directive to all loops, including implicit ones, to target GPU parallelism and explicit data movement directives. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psyir: the PSyIR which this script will transform. 
''' - acc_parallel_trans = TransInfo().get_trans_name('ACCParallelTrans') - acc_loop_trans = TransInfo().get_trans_name('ACCLoopTrans') + acc_parallel_trans = ACCParallelTrans() + acc_loop_trans = ACCLoopTrans() - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psyir.walk(Routine): + print(routine.name) # Convert array and range notation to loops and hoist expressions normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + scalarise_loops=True, hoist_expressions=True, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=acc_parallel_trans, loop_directive_trans=acc_loop_trans, collapse=True ) - if invoke.schedule.walk(Directive): - ACCEnterDataTrans().apply(invoke.schedule) - ACCUpdateTrans().apply(invoke.schedule) - - return psy + if routine.walk(Directive): + ACCEnterDataTrans().apply(routine) + ACCUpdateTrans().apply(routine) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py index 1557dd54..efef8816 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_loops_unified_memory_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2022-2023, Science and Technology Facilities Council. +# Copyright (c) 2022-2025, Science and Technology Facilities Council. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -36,38 +36,35 @@ ''' PSyclone transformation script to insert OpenACC Parallel Loop directives to the outermost loop that is parallelisable, including implicit loops.''' -from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import Node, Routine +from psyclone.transformations import ACCParallelTrans, ACCLoopTrans + from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psyir: Node) -> None: ''' Add OpenACC Parallel Loop directive to all loops, including implicit ones to target GPU parallelism. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psyir: the PSyIR which this script will transform. ''' - acc_parallel_trans = TransInfo().get_trans_name('ACCParallelTrans') - acc_loop_trans = TransInfo().get_trans_name('ACCLoopTrans') + acc_parallel_trans = ACCParallelTrans() + acc_loop_trans = ACCLoopTrans() - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psyir.walk(Routine): + print(routine.name) normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, hoist_expressions=True, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=acc_parallel_trans, loop_directive_trans=acc_loop_trans, collapse=True ) - return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py index db5dee0b..b8f63f50 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_explicit_data_movement_trans.py @@ -1,7 +1,7 @@ # 
----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2023, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -39,7 +39,7 @@ Once you have psyclone installed, this may be used by doing: - $ psyclone -api nemo -s ./acc_mixed_explicit_data_movement_trans.py + $ psyclone -s ./acc_mixed_explicit_data_movement_trans.py This should produce a lot of output, ending with generated Fortran. Note that the Fortran source files provided to PSyclone must have already been @@ -47,50 +47,48 @@ ''' -from psyclone.psyir.nodes import Directive +from psyclone.psyir.nodes import Directive, Node, Routine from psyclone.psyir.transformations import ACCUpdateTrans from psyclone.transformations import ACCEnterDataTrans, ACCLoopTrans from utils import add_kernels, normalise_loops, \ insert_explicit_loop_parallelism -def trans(psy): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels', 'loop' and explicit 'data' directives to NEMO code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' + :param psyir: The PSyIR to apply transformations to. - print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + ''' + print("Routines found:") + print("\n".join([rt.name for rt in psyir.walk(Routine)])) - for invoke in psy.invokes.invoke_list: + for routine in psyir.walk(Routine): - sched = invoke.schedule - if not sched: - print("Invoke {invoke.name} has no Schedule! Skipping...") + if not routine.children: + print("Routine {routine.name} is empty! 
Skipping...") continue # Convert array and range syntax to explicit loops normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + scalarise_loops=True, hoist_expressions=True, ) # Add OpenACC Loop directives insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=None, loop_directive_trans=ACCLoopTrans(), collapse=True ) # Add OpenACC Kernel directives - add_kernels(sched.children) + add_kernels(routine.children) # Add OpenACC data directives - if invoke.schedule.walk(Directive): - ACCEnterDataTrans().apply(invoke.schedule) - ACCUpdateTrans().apply(invoke.schedule) + if routine.walk(Directive): + ACCEnterDataTrans().apply(routine) + ACCUpdateTrans().apply(routine) diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py index 3cc7b6c0..c3cd28cb 100644 --- a/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/acc_mixed_unified_memory_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2023, Science and Technology Facilities Council. +# Copyright (c) 2018-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -47,43 +47,42 @@ ''' +from psyclone.psyir.nodes import Node, Routine +from psyclone.transformations import ACCLoopTrans from utils import add_kernels, normalise_loops, \ insert_explicit_loop_parallelism -def trans(psy): +def trans(psyir: Node) -> None: '''A PSyclone-script compliant transformation function. Applies OpenACC 'kernels' and 'loop' directives to NEMO code. - :param psy: The PSy layer object to apply transformations to. - :type psy: :py:class:`psyclone.psyGen.PSy` - ''' + :param psyir: The PSyIR to apply transformations to. 
- print("Invokes found:") - print("\n".join([str(name) for name in psy.invokes.names])) + ''' + print("Routines found:") + print("\n".join([rt.name for rt in psyir.walk(Routine)])) - for invoke in psy.invokes.invoke_list: + for routine in psyir.walk(Routine): - sched = invoke.schedule - if not sched: - print("Invoke {invoke.name} has no Schedule! Skipping...") + if not routine.children: + print(f"Routine {routine.name} is empty! Skipping...") continue # Convert array and range syntax to explicit loops normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + scalarise_loops=True, hoist_expressions=True, ) # Add OpenACC Loop directives insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=None, loop_directive_trans=ACCLoopTrans(), collapse=True ) # Add OpenACC Kernel directives - add_kernels(sched.children) + add_kernels(routine) diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py index bc6b7ef9..c2e9d9fb 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_levels_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2018-2023, Science and Technology Facilities Council +# Copyright (c) 2018-2025, Science and Technology Facilities Council # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -33,40 +33,43 @@ # ----------------------------------------------------------------------------- # Authors: R. W. Ford, A. R. Porter and S. Siso, STFC Daresbury Lab -'''A simple transformation script for the introduction of OpenMP with PSyclone. +'''A very simple transformation script for the introduction of OpenMP + to certain loops using PSyclone. 
- >>> psyclone -api "nemo" -s ./omp_cpu_levels_trans.py tra_adv.F90 + >>> psyclone -s ./omp_cpu_levels_trans.py tra_adv.F90 This should produce a lot of output, ending with generated Fortran. ''' -from psyclone.psyGen import TransInfo -from psyclone.nemo import NemoKern +from psyclone.psyir.nodes import Loop, Node, Routine +from psyclone.transformations import OMPParallelLoopTrans, TransformationError -def trans(psy): +# Set up some loop_type inference rules in order to reference useful domain +# loop constructs by name +Loop.set_loop_type_inference_rules({ + "lon": {"variable": "ji"}, + "lat": {"variable": "jj"}, + "levels": {"variable": "jk"} +}) + + +def trans(psyir: Node) -> None: ''' Transform a specific Schedule by making all loops over levels OpenMP parallel. - :param psy: the object holding all information on the PSy layer \ - to be modified. - :type psy: :py:class:`psyclone.psyGen.PSy` - - :returns: the transformed PSy object - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psyir: the PSyIR to be modified. 
''' # Get the transformation we will apply - ompt = TransInfo().get_trans_name('OMPParallelLoopTrans') - for invoke in psy.invokes.invoke_list: - # Get the Schedule of the target routine - sched = invoke.schedule + ompt = OMPParallelLoopTrans() + for sched in psyir.walk(Routine): # Apply the OMP transformation to each loop over levels containing # a kernel for loop in sched.loops(): - kernels = loop.walk(NemoKern) - if kernels and loop.loop_type == "levels": - ompt.apply(loop) - - # Return the modified psy object - return psy + if loop.loop_type == "levels": + try: + ompt.apply(loop) + except TransformationError as err: + loop.append_preceding_comment( + f"Loop cannot be parallelised because: {err}") diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py index c77813e7..1860eead 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_cpu_trans.py @@ -1,7 +1,7 @@ # ----------------------------------------------------------------------------- # BSD 3-Clause License # -# Copyright (c) 2022, Science and Technology Facilities Council. +# Copyright (c) 2022-2025, Science and Technology Facilities Council. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -36,38 +36,34 @@ ''' PSyclone transformation script to insert OpenMP Parallel Loop directives to the outermost loop that is parallelisable, including implicit loops.''' -from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import Node, Routine +from psyclone.transformations import OMPParallelTrans, OMPLoopTrans from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psyir: Node) -> None: ''' Add OpenMP Parallel Loop directive to all loops, including implicit ones to target CPU parallelism. - :param psy: the PSy object which this script will transform. 
- :type psy: :py:class:`psyclone.psyGen.PSy` - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psyir: the PSyIR which this script will transform. ''' - omp_parallel_trans = TransInfo().get_trans_name('OMPParallelTrans') - omp_loop_trans = TransInfo().get_trans_name('OMPLoopTrans') + omp_parallel_trans = OMPParallelTrans() + omp_loop_trans = OMPLoopTrans() - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psyir.walk(Routine): + print(routine.name) normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, + routine, + convert_array_notation=True, hoist_expressions=False, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=omp_parallel_trans, loop_directive_trans=omp_loop_trans, collapse=False ) - - return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py index be2f4927..6612a9f5 100644 --- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py @@ -36,19 +36,17 @@ ''' PSyclone transformation script to insert OpenMP Target Loop directives to the outermost loop that is parallelisable, including implicit loops. ''' +from psyclone.psyir.nodes import Node, Routine +from psyclone.psyir.transformations import OMPTargetTrans, OMPLoopTrans from utils import insert_explicit_loop_parallelism, normalise_loops -def trans(psy): +def trans(psyir: Node) -> None: ''' Add OpenMP Target and Loop directives to all loops, including the implicit ones, to parallelise the code and execute it in an acceleration device. - :param psy: the PSy object which this script will transform. - :type psy: :py:class:`psyclone.psyGen.PSy` - :returns: the transformed PSy object. - :rtype: :py:class:`psyclone.psyGen.PSy` + :param psyir: the PSyIR which this script will transform. 
''' omp_target_trans = OMPTargetTrans() @@ -56,21 +54,17 @@ def trans(psy): omp_loop_trans.omp_directive = "teamsdistributeparalleldo" omp_loop_trans.omp_schedule = "none" - print("Invokes found:") - for invoke in psy.invokes.invoke_list: - print(invoke.name) + print("Routines found:") + for routine in psyir.walk(Routine): + print(routine.name) normalise_loops( - invoke.schedule, - unwrap_array_ranges=True, - hoist_expressions=True, + routine, ) insert_explicit_loop_parallelism( - invoke.schedule, + routine, region_directive_trans=omp_target_trans, loop_directive_trans=omp_loop_trans, collapse=True ) - - return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh index 09441c3e..f7b7d985 100755 --- a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh +++ b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh @@ -2,6 +2,8 @@ # Bash script to execute the tracer-advection benchmark with increasing # domain sizes. +# By default the process is pinned to core 0. Please edit the taskset +# command below if you wish to change this. if [ "$#" -lt 1 ] || [ ! -x "$1" ]; then echo "Wrong arguments. Usage: ../../problemsize.sh ./executable" @@ -24,7 +26,8 @@ for power in $(seq 4 9); do export JPI=${size} export JPJ=${size} - time=$(taskset -c 2 $@ | awk '{if ($1 == "Time-stepping") {print $5} }') + # Execute - use taskset to pin the process to a core. + time=$(taskset -c 0 $@ | awk '{if ($1 == "Time-stepping") {print $5} }') echo $size $time done diff --git a/benchmarks/nemo/tracer_advection/scripts/utils.py b/benchmarks/nemo/tracer_advection/scripts/utils.py index 8c9f530a..db02f2be 100644 --- a/benchmarks/nemo/tracer_advection/scripts/utils.py +++ b/benchmarks/nemo/tracer_advection/scripts/utils.py @@ -35,16 +35,29 @@ ''' Utilities file to parallelise Nemo code. 
''' -from psyclone.domain.nemo.transformations import NemoAllArrayRange2LoopTrans +import os +from typing import List, Union + from psyclone.errors import InternalError -from psyclone.psyir.nodes import Loop, Assignment, Directive, CodeBlock, Call -from psyclone.psyir.transformations import HoistLoopBoundExprTrans, HoistTrans -from psyclone.transformations import TransformationError, ACCKernelsTrans +from psyclone.psyir.nodes import ( + Assignment, Directive, CodeBlock, Call, IfBlock, IntrinsicCall, Loop, Node, + Reference, Return, Routine, Schedule, StructureReference) +from psyclone.psyir.symbols import DataSymbol +from psyclone.psyir.transformations import ( + ACCKernelsTrans, ArrayAssignment2LoopsTrans, HoistLocalArraysTrans, + HoistLoopBoundExprTrans, + HoistTrans, Maxval2LoopTrans, OMPMinimiseSyncTrans, + Reference2ArrayRangeTrans, ScalarisationTrans) +from psyclone.transformations import TransformationError def normalise_loops( schedule, - unwrap_array_ranges: bool = True, + hoist_local_arrays: bool = True, + convert_array_notation: bool = True, + loopify_array_intrinsics: bool = True, + convert_range_loops: bool = True, + scalarise_loops: bool = False, hoist_expressions: bool = True, ): ''' Normalise all loops in the given schedule so that they are in an @@ -52,20 +65,77 @@ def normalise_loops( them. :param schedule: the PSyIR Schedule to transform. - :param unwrap_array_ranges: whether to convert ranges to explicit loops. - :param hoist_expressions: whether to hoist bounds and loop invariant \ + :type schedule: :py:class:`psyclone.psyir.nodes.node` + :param bool hoist_local_arrays: whether to hoist local arrays. + :param bool convert_array_notation: whether to convert array notation + to explicit loops. + :param bool loopify_array_intrinsics: whether to convert intrinsics that + operate on arrays to explicit loops (currently only maxval). + :param bool convert_range_loops: whether to convert ranges to explicit + loops. 
+ :param scalarise_loops: whether to attempt to convert arrays to scalars + where possible, default is False. + :param hoist_expressions: whether to hoist bounds and loop invariant statements out of the loop nest. ''' - if unwrap_array_ranges: + if hoist_local_arrays: + # Apply the HoistLocalArraysTrans when possible, it cannot be applied + # to files with statement functions because it will attempt to put the + # allocate above it, which is not valid Fortran. + try: + HoistLocalArraysTrans().apply(schedule) + except TransformationError: + pass + + if convert_array_notation: + # Make sure all array dimensions are explicit + for reference in schedule.walk(Reference): + part_of_the_call = reference.ancestor(Call) + if part_of_the_call: + if not part_of_the_call.is_elemental: + continue + if isinstance(reference.symbol, DataSymbol): + try: + Reference2ArrayRangeTrans().apply(reference) + except TransformationError: + pass + + if loopify_array_intrinsics: + for intr in schedule.walk(IntrinsicCall): + if intr.intrinsic.name == "MAXVAL": + try: + Maxval2LoopTrans().apply(intr) + except TransformationError as err: + print(err.value) + + if convert_range_loops: # Convert all array implicit loops to explicit loops - explicit_loops = NemoAllArrayRange2LoopTrans() + explicit_loops = ArrayAssignment2LoopsTrans() for assignment in schedule.walk(Assignment): - explicit_loops.apply(assignment) + if assignment.walk(StructureReference): + continue # TODO #2951 Fix issues with structure_refs + try: + explicit_loops.apply(assignment) + except TransformationError: + pass + + if scalarise_loops: + # Apply scalarisation to every loop. Execute this in reverse order + # as sometimes we can scalarise earlier loops if following loops + # have already been scalarised. 
+ loops = schedule.walk(Loop) + loops.reverse() + scalartrans = ScalarisationTrans() + for loop in loops: + scalartrans.apply(loop) if hoist_expressions: # First hoist all possible expressions for loop in schedule.walk(Loop): - HoistLoopBoundExprTrans().apply(loop) + try: + HoistLoopBoundExprTrans().apply(loop) + except TransformationError: + pass # Hoist all possible assignments (in reverse order so the inner loop # constants are hoisted all the way out if possible) @@ -81,47 +151,76 @@ def insert_explicit_loop_parallelism( schedule, region_directive_trans=None, loop_directive_trans=None, - collapse: bool = True + collapse: bool = True, + privatise_arrays: bool = False, + asynchronous_parallelism: bool = False, + uniform_intrinsics_only: bool = False, + enable_reductions: bool = False, ): ''' For each loop in the schedule that doesn't already have a Directive as an ancestor, attempt to insert the given region and loop directives. - :param region_directive_trans: PSyclone transformation to insert the \ + :param schedule: the PSyIR Schedule to transform. + :type schedule: :py:class:`psyclone.psyir.nodes.Node` + :param region_directive_trans: PSyclone transformation that inserts the region directive. - :param loop_directive_trans: PSyclone transformation to use to insert the \ + :type region_directive_trans: \ + :py:class:`psyclone.transformation.Transformation` + :param loop_directive_trans: PSyclone transformation that inserts the + loop parallelisation directive. + :type loop_directive_trans: \ + :py:class:`psyclone.transformation.Transformation` + :param collapse: whether to attempt to insert the collapse clause to as many nested loops as possible. + :param privatise_arrays: whether to attempt to privatise arrays that cause + write-write race conditions. + :param asynchronous_parallelism: whether to attempt to add asynchronicity + to the parallel sections. 
+ :param uniform_intrinsics_only: if True it prevents offloading loops + with non-reproducible device intrinsics. + :param enable_reductions: whether to enable generation of reduction + clauses automatically. + ''' + nemo_v4 = os.environ.get('NEMOV4', False) # Add the parallel directives in each loop for loop in schedule.walk(Loop): if loop.ancestor(Directive): continue # Skip if an outer loop is already parallelised + opts = {"collapse": collapse, "privatise_arrays": privatise_arrays, + "verbose": True, "nowait": asynchronous_parallelism, + "enable_reductions": enable_reductions} + + if uniform_intrinsics_only: + opts["device_string"] = "nvfortran-uniform" + try: - loop_directive_trans.apply(loop) - # Only add the region directive if the loop was successfully - # parallelised. - if region_directive_trans is not None: - region_directive_trans.apply(loop.parent.parent) - except TransformationError as err: - # This loop can not be transformed, proceed to next loop - print("Loop not parallelised because:", str(err)) - continue + # First check that the region_directive is feasible for this region + if region_directive_trans: + # TODO psyclone/#3066 - validate *should* accept a single Node + # but currently has a bug and doesn't so we have to make a + # list and pass that. + region_directive_trans.validate([loop], options=opts) - if collapse: - # Count the number of perfectly nested loops - num_nested_loops = 0 - next_loop = loop - while isinstance(next_loop, Loop): - num_nested_loops += 1 - if len(next_loop.loop_body.children) > 1: - break - next_loop = next_loop.loop_body.children[0] + # If it is, apply the parallelisation directive + loop_directive_trans.apply(loop, options=opts) - if num_nested_loops > 1: - loop.parent.parent.collapse = num_nested_loops + # And if successful, the region directive on top. 
+ if region_directive_trans: + region_directive_trans.apply(loop.parent.parent, options=opts) + except TransformationError: + # This loop cannot be transformed, proceed to next loop. + # The parallelisation restrictions will be explained with a comment + # associated with the loop in the generated output. + continue + + # If we are adding asynchronous parallelism then we now try to minimise + # the number of barriers. + if asynchronous_parallelism: + minsync_trans = OMPMinimiseSyncTrans() + minsync_trans.apply(schedule) def valid_kernel(node): @@ -136,20 +235,23 @@ def valid_kernel(node): :rtype: bool ''' - excluded_node_types = (CodeBlock, Call) - return node.walk(excluded_node_types) == [] + try: + ACCKernelsTrans().validate(node, {"disable_loop_check": True}) + except TransformationError: + return False + + return True -def add_kernels(children, default_present=True): +def add_kernels(children: list[Node], default_present: bool = True): ''' Walks through the PSyIR inserting OpenACC KERNELS directives at as high a level as possible. - :param children: list of sibling Nodes in PSyIR that are candidates for \ + :param children: list of sibling Nodes in PSyIR that are candidates for inclusion in an ACC KERNELS region. - :type children: list of :py:class:`psyclone.psyir.nodes.Node` - :param bool default_present: whether or not to supply the \ - DEFAULT(PRESENT) clause to ACC KERNELS directives. + :param default_present: whether or not to supply the + DEFAULT(PRESENT) clause to ACC KERNELS directives. ''' if not children: @@ -168,16 +270,15 @@ def add_kernels(children, default_present=True): try_kernels_trans(node_list, default_present) -def try_kernels_trans(nodes, default_present): +def try_kernels_trans(nodes: list[Node], default_present: bool): ''' Attempt to enclose the supplied list of nodes within a kernels region. If the transformation fails then the error message is reported but execution continues. :param nodes: list of Nodes to enclose within a Kernels region. 
- :type nodes: list of :py:class:`psyclone.psyir.nodes.Node` - :param bool default_present: whether or not to supply the \ - DEFAULT(PRESENT) clause to ACC KERNELS directives. + :param default_present: whether or not to supply the + DEFAULT(PRESENT) clause to ACC KERNELS directives. ''' if not nodes: diff --git a/benchmarks/shallow/SEQ/runme_loop_fuse.py b/benchmarks/shallow/SEQ/runme_loop_fuse.py deleted file mode 100644 index 0265c266..00000000 --- a/benchmarks/shallow/SEQ/runme_loop_fuse.py +++ /dev/null @@ -1,25 +0,0 @@ -from parse import parse,ParseError -from psyGen import PSyFactory,GenerationError -#from algGen import Alg -api="gocean" -filename="shallow_gocean.f90" -ast,invokeInfo=parse(filename,api=api,invoke_name="invoke") -psy=PSyFactory(api).create(invokeInfo) -print psy.gen -#alg=Alg(ast,psy) - -print psy.invokes.names -schedule=psy.invokes.get('invoke_0').schedule -schedule.view() - -from psyGen import TransInfo -t=TransInfo() -print t.list -#lf=t.get_trans_name('DoubleLoopFuse') -lf=t.get_trans_name('LoopFuse') - -newschedule,memento=lf.apply(schedule.children[0],schedule.children[1]) -#newschedule,memento=lf.apply(schedule.children[0].children[0].children[0],schedule.children[1].children[0].children[0]) -newschedule.view() -#psy.invokes.get('invoke_0')._schedule=newschedule -#print psy.gen diff --git a/compiler_setup/intel.sh b/compiler_setup/intel.sh index dd3a70dd..ce334af0 100644 --- a/compiler_setup/intel.sh +++ b/compiler_setup/intel.sh @@ -41,10 +41,8 @@ OMPFLAGS="-qopenmp" LDFLAGS= #LDFLAGS+= -fast -# The archiver used to generate the API library. We must -# use Intel's xiar if doing IPO as otherwise the library -# doesn't contain the necessary symbols. -AR=xiar +# The archiver used to generate the API library. 
+AR=ar ARFLAGS=cru export F90 diff --git a/compiler_setup/nvidia.sh b/compiler_setup/nvidia.sh index c9abd3b2..61c4db93 100644 --- a/compiler_setup/nvidia.sh +++ b/compiler_setup/nvidia.sh @@ -26,7 +26,7 @@ OMPFLAGS="-mp" # Flag to use when compiling with OpenMP GPU offloading support OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" # Flag to use to specify use of 'managed memory' (unified memory) -UMEMFLAGS="-gpu=managed" +UMEMFLAGS="-gpu=mem:managed" # Flags to use when compiling with OpenACC support ACCFLAGS="-acc=gpu -gpu=ccnative" diff --git a/compiler_setup/nvidia_acc.sh b/compiler_setup/nvidia_acc.sh index 415e73a8..61c4f289 100644 --- a/compiler_setup/nvidia_acc.sh +++ b/compiler_setup/nvidia_acc.sh @@ -11,30 +11,14 @@ CFLAGS="-g" F90FLAGS="-O3 -Minfo=all" # Debugging options #F90FLAGS"+=" -fcheck=all -fbacktrace -ffpe-trap=invalid -g -O0" -# -Mcuda is for CUDA Fortran -# nordc - do not link to routines compiled for device (ensure -# kernel code is in-lined in loops) -# cc = compute capability -# Registers are shared by threads in an SMP. The more registers a kernel -# uses, the fewer threads it can support. This parameter can be tuned and -# should be a multiple of 8. -# -Mcuda is required to build CUDA Fortran -# For Quadro K600 -#F90FLAGS+=" -acc -ta=tesla:cc30,nordc -Mcuda=cc30,nordc" -# For Tesla K20c -#F90FLAGS+=" -acc -ta=tesla,cc35,maxregcount:80,nordc -Mcuda=cc35,maxregcount:80,nordc" -# V100 with managed memory -F90FLAGS+=" -acc=gpu -gpu=cc70,managed" +# managed memory +F90FLAGS+=" -acc=gpu -gpu=mem:managed" # Linker flags -# For Quadro K600 -#LDFLAGS+=" -acc -ta=tesla,cc30 -Mcuda=cc30,nordc" -# For Tesla K20c -#LDFLAGS="-acc -ta=nvidia,cc35 -Mcuda=cc35,nordc" -# V100 with managed memory -LDFLAGS="-acc=gpu -gpu=cc70,managed" -# Location of various CUDA maths libraries. libnvToolsExt is required when +# managed memory +LDFLAGS="-acc=gpu -gpu=mem:managed" +# Location of various CUDA maths libraries. nvtx3interop is required when # using nvtx for profiling. 
-LDFLAGS+=" -Mcuda -L${CUDA_MATH_DIR}/lib64 -lnvToolsExt" +LDFLAGS+=" -cuda -L${CUDA_MATH_DIR}/lib64 -lnvtx3interop" # Flags to use when compiling with OpenMP support OMPFLAGS="-mp" # Command to use to create archive of object files diff --git a/compiler_setup/spack_nvidia.sh b/compiler_setup/spack_nvidia.sh new file mode 100644 index 00000000..a3ac0450 --- /dev/null +++ b/compiler_setup/spack_nvidia.sh @@ -0,0 +1,13 @@ +# Build settings for the Nvidia compiler +# ================================================ +# Fortran compiler + +# ============================== +export F90=$FC + +export LDFLAGS="-cuda -L${CUDA_HOME}/lib64 -lnvtx3interop" +export OMPTARGETFLAGS="-mp=gpu -gpu=ccnative" +export OMPFLAGS="-mp" +export UMEMFLAGS="-gpu=mem:managed" +export ACCFLAGS="-acc=gpu -gpu=ccnative" + diff --git a/shared/FortCL b/shared/FortCL index d516ed01..401148e4 160000 --- a/shared/FortCL +++ b/shared/FortCL @@ -1 +1 @@ -Subproject commit d516ed01ea23565bfc4f531a795d2c7a2a57fe50 +Subproject commit 401148e4b6d6efdd4d0157123b118ed07d831446 diff --git a/shared/PSyclone b/shared/PSyclone index 106543da..63d4c225 160000 --- a/shared/PSyclone +++ b/shared/PSyclone @@ -1 +1 @@ -Subproject commit 106543dafe26fe114de192f27311637a85a28a81 +Subproject commit 63d4c22552fb6cd5fafbd4185ef373a1d9e3713c diff --git a/shared/dl_esm_inf b/shared/dl_esm_inf index ad209e9d..358402ec 160000 --- a/shared/dl_esm_inf +++ b/shared/dl_esm_inf @@ -1 +1 @@ -Subproject commit ad209e9d252995bd83127de4c481232ca14ed655 +Subproject commit 358402ecc4d88e93a62a3ca13dc9d20d2eb27f90 diff --git a/shared/kokkos b/shared/kokkos index ae5fc649..552f2375 160000 --- a/shared/kokkos +++ b/shared/kokkos @@ -1 +1 @@ -Subproject commit ae5fc649ef4b62b48a01123759ed066bff227b43 +Subproject commit 552f2375de06361f8a5662abc0859ae233b5d8f8