diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile index 8e40229b..9453719e 100644 --- a/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile +++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/Makefile @@ -45,7 +45,7 @@ tra_adv_no_auto_serial: dl_timer # OpenACC version with timer around outer loop only. tra_adv_acc: dl_timer mkdir -p $@ - ${PSYCLONE} -s ../scripts/kernels_trans.py -opsy \ + ${PSYCLONE} -s ../scripts/acc_kernels_trans.py -opsy \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. @@ -55,39 +55,68 @@ tra_adv_acc: dl_timer # OpenACC version with nvtx profiling instrumentation. tra_adv_acc_prof: dl_timer mkdir -p $@ - ${PSYCLONE} --profile invokes -s ../scripts/kernels_trans.py -opsy \ + ${PSYCLONE} --profile invokes -s ../scripts/acc_kernels_trans.py -opsy \ $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 cp Makefile_gen $@/Makefile cp tra_adv_driver.F90 $@/. ${MAKE} PROF_LIB_INC="-I${PSYCLONE_NVIDIA_LIB_DIR} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME} ${PSYCLONE_NVIDIA_LIB_DIR}/libnvtx_prof.a" -C $@ +# Serial version in SIR compliant form. +tra_adv_no_scalars_serial: dl_timer + mkdir -p $@ + cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90 + cp tra_adv_compute_no_scalars.F90 $@/tra_adv_compute.F90 + cp Makefile_gen $@/Makefile + ${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \ + LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ + +# OpenACC version with timer around outer loop only. 
+tra_adv_no_scalars_acc: dl_timer + mkdir -p $@ + ${PSYCLONE} -s ../scripts/acc_kernels_trans.py -opsy \ + $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90 + cp Makefile_gen $@/Makefile + cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90 + ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ + LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \ + -C $@ + # Serial Fortran version after transformation to SIR-compliant form. -tra_adv_sir: dl_timer +tra_adv_sir: dl_timer ../scripts/sir_loop_trans.py mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_trans.py -opsy $@/tra_adv_compute.f90 \ - ./tra_adv_compute_auto_arrays.F90 + ${PSYCLONE} -s ../scripts/sir_loop_trans.py -opsy $@/tra_adv_compute.f90 \ + ./tra_adv_compute_no_scalars.F90 cp Makefile_gen $@/Makefile - cp tra_adv_driver.F90 $@/. + cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.F90 ${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ # OpenACC added after transformation to SIR-compliant form. -tra_adv_sir_acc: dl_timer +tra_adv_sir_acc_um: dl_timer ../scripts/sir_loop_kernels_um_trans.py mkdir -p $@ - ${PSYCLONE} -s ../scripts/sir_kernels_trans.py -opsy \ - $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 + ${PSYCLONE} -s ../scripts/sir_loop_kernels_trans.py -opsy \ + $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90 cp Makefile_gen $@/Makefile - cp tra_adv_driver.F90 $@/. 
- ${MAKE} PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" \ - LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ + cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.f90 + ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -Mfma -I../${DL_TIMER_DIR}/src" \ + LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" -C $@ + +tra_adv_sir_acc: dl_timer ../scripts/sir_loop_kernels_trans.py + mkdir -p $@ + ${PSYCLONE} -s ../scripts/sir_loop_kernels_trans.py -opsy \ + $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90 + cp Makefile_gen $@/Makefile + cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.f90 + ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} -I../${DL_TIMER_DIR}/src" \ + LDFLAGS="${LDFLAGS} ${ACCFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" PROF_LIB_INC="-I../${DL_TIMER_DIR}/src" -C $@ tra_adv_sir_acc_prof: dl_timer mkdir -p $@ ${PSYCLONE} --profile invokes -s ../scripts/sir_kernels_trans.py -opsy \ - $@/tra_adv_compute.f90 ./tra_adv_compute_auto_arrays.F90 + $@/tra_adv_compute.f90 ./tra_adv_compute_no_scalars.F90 cp Makefile_gen $@/Makefile - cp tra_adv_driver.F90 $@/. 
+ cp tra_adv_driver_no_scalars.F90 $@/tra_adv_driver.f90 ${MAKE} PROF_LIB_INC="-I${PSYCLONE_NVIDIA_LIB_DIR} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME} ${PSYCLONE_NVIDIA_LIB_DIR}/libnvtx_prof.a" -C $@ diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_compute_no_scalars.F90 b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_compute_no_scalars.F90 new file mode 100644 index 00000000..57e2c7c0 --- /dev/null +++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_compute_no_scalars.F90 @@ -0,0 +1,201 @@ +module tra_adv_compute_mod + implicit none + +contains + + subroutine tra_adv_compute(zind, tsn, ztfreez, rnfmsk, rnfmsk_z, upsmsk, tmask, zwx, zwy, umask, vmask, mydomain, zslpx, zslpy, pun, pvn, pwn, jpi, jpj, jpk, iter) + + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout):: zind + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(in) :: tsn, tmask + REAL*8, ALLOCATABLE, DIMENSION(:, :), intent(in) :: ztfreez, rnfmsk, upsmsk + REAL*8, ALLOCATABLE, DIMENSION(:), intent(in) :: rnfmsk_z + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout) :: zwx, zwy + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(in) :: umask, vmask + + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout) :: zslpx, zslpy + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(inout) :: mydomain + REAL*8, ALLOCATABLE, DIMENSION(:, :, :), intent(in) :: pun, pvn + + REAL*8, ALLOCATABLE, DIMENSION(:,:,:), intent(in) :: pwn + + INTEGER, INTENT(IN) :: jpi, jpj, jpk + INTEGER, INTENT(IN) :: iter + + REAL*8 :: zbtr, ztra + + REAL*8 :: z0u, zalpha, zu, zdt, zzwx, zzwy, z0v, zv + + REAL*8 :: zice + + REAL*8 :: z0w, zw + + INTEGER :: ji, jj, jk + + DO jk = 1, jpk + DO jj = 1, jpj + DO ji = 1, jpi + zice = 0.d0 + IF (tsn(ji, jj, jk) <= ztfreez(ji, jj) + 0.1d0) THEN; zice = 1.d0 + ELSE; zice = 0.d0 + END IF + + zind(ji, jj, jk) = MAX( & + rnfmsk(ji, jj)*rnfmsk_z(jk), & + upsmsk(ji, jj), & + zice & + & )*tmask(ji, jj, 
jk) + zind(ji, jj, jk) = 1 - zind(ji, jj, jk) + END DO + END DO + END DO + + zwx(:, :, jpk) = 0.e0; zwy(:, :, jpk) = 0.e0 + + DO jk = 1, jpk - 1 + DO jj = 1, jpj - 1 + DO ji = 1, jpi - 1 + zwx(ji, jj, jk) = umask(ji, jj, jk)*(mydomain(ji + 1, jj, jk) - mydomain(ji, jj, jk)) + zwy(ji, jj, jk) = vmask(ji, jj, jk)*(mydomain(ji, jj + 1, jk) - mydomain(ji, jj, jk)) + END DO + END DO + END DO + + zslpx(:, :, jpk) = 0.e0; zslpy(:, :, jpk) = 0.e0 + + DO jk = 1, jpk - 1 + DO jj = 2, jpj + DO ji = 2, jpi + zslpx(ji, jj, jk) = (zwx(ji, jj, jk) + zwx(ji - 1, jj, jk)) & + & *(0.25d0 + SIGN(0.25d0, zwx(ji, jj, jk)*zwx(ji - 1, jj, jk))) + zslpy(ji, jj, jk) = (zwy(ji, jj, jk) + zwy(ji, jj - 1, jk)) & + & *(0.25d0 + SIGN(0.25d0, zwy(ji, jj, jk)*zwy(ji, jj - 1, jk))) + END DO + END DO + END DO + + DO jk = 1, jpk - 1 + DO jj = 2, jpj + DO ji = 2, jpi + zslpx(ji, jj, jk) = SIGN(1.d0, zslpx(ji, jj, jk))*MIN(ABS(zslpx(ji, jj, jk)), & + & 2.d0*ABS(zwx(ji - 1, jj, jk)), & + & 2.d0*ABS(zwx(ji, jj, jk))) + zslpy(ji, jj, jk) = SIGN(1.d0, zslpy(ji, jj, jk))*MIN(ABS(zslpy(ji, jj, jk)), & + & 2.d0*ABS(zwy(ji, jj - 1, jk)), & + & 2.d0*ABS(zwy(ji, jj, jk))) + END DO + END DO + END DO + + DO jk = 1, jpk - 1 + !zdt = 1 + DO jj = 2, jpj - 1 + DO ji = 2, jpi - 1 + z0u = SIGN(0.5d0, pun(ji, jj, jk)) + zalpha = 0.5d0 - z0u + !zu = z0u - 0.5d0*pun(ji, jj, jk)*zdt + zu = z0u - 0.5d0*pun(ji, jj, jk)*1.0 + + zzwx = mydomain(ji + 1, jj, jk) + zind(ji, jj, jk)*(zu*zslpx(ji + 1, jj, jk)) + zzwy = mydomain(ji, jj, jk) + zind(ji, jj, jk)*(zu*zslpx(ji, jj, jk)) + + zwx(ji, jj, jk) = pun(ji, jj, jk)*(zalpha*zzwx + (1.-zalpha)*zzwy) + + z0v = SIGN(0.5d0, pvn(ji, jj, jk)) + zalpha = 0.5d0 - z0v + !zv = z0v - 0.5d0*pvn(ji, jj, jk)*zdt + zv = z0v - 0.5d0*pvn(ji, jj, jk)*1.0 + + zzwx = mydomain(ji, jj + 1, jk) + zind(ji, jj, jk)*(zv*zslpy(ji, jj + 1, jk)) + zzwy = mydomain(ji, jj, jk) + zind(ji, jj, jk)*(zv*zslpy(ji, jj, jk)) + + zwy(ji, jj, jk) = pvn(ji, jj, jk)*(zalpha*zzwx + (1.d0 - zalpha)*zzwy) + END DO + END DO 
+ END DO + + DO jk = 1, jpk - 1 + DO jj = 2, jpj - 1 + DO ji = 2, jpi - 1 + zbtr = 1. + ztra = -zbtr*(zwx(ji, jj, jk) - zwx(ji - 1, jj, jk) & + & + zwy(ji, jj, jk) - zwy(ji, jj - 1, jk)) + mydomain(ji, jj, jk) = mydomain(ji, jj, jk) + ztra + END DO + END DO + END DO + + zwx(:, :, 1) = 0.e0; zwx(:, :, jpk) = 0.e0 + + DO jk = 2, jpk - 1 + zwx(:, :, jk) = tmask(:, :, jk)*(mydomain(:, :, jk - 1) - mydomain(:, :, jk)) + END DO + + zslpx(:, :, 1) = 0.e0 + + DO jk = 2, jpk - 1 + DO jj = 1, jpj + DO ji = 1, jpi + zslpx(ji, jj, jk) = (zwx(ji, jj, jk) + zwx(ji, jj, jk + 1)) & + & *(0.25d0 + SIGN(0.25d0, zwx(ji, jj, jk)*zwx(ji, jj, jk + 1))) + END DO + END DO + END DO + + DO jk = 2, jpk - 1 + DO jj = 1, jpj + DO ji = 1, jpi + zslpx(ji, jj, jk) = SIGN(1.d0, zslpx(ji, jj, jk))*MIN(ABS(zslpx(ji, jj, jk)), & + & 2.d0*ABS(zwx(ji, jj, jk + 1)), & + & 2.d0*ABS(zwx(ji, jj, jk))) + END DO + END DO + END DO + + zwx(:, :, 1) = pwn(:, :, 1)*mydomain(:, :, 1) + + !zdt = 1 + !zbtr = 1. + !DO jk = 1, jpk-1 + ! DO jj = 2, jpj-1 + ! DO ji = 2, jpi-1 + ! z0w = SIGN( 0.5d0, pwn(ji,jj,jk+1) ) + ! zalpha = 0.5d0 + z0w + ! zw = z0w - 0.5d0 * pwn(ji,jj,jk+1) * zdt * zbtr + ! + ! zzwx = mydomain(ji,jj,jk+1) + zind(ji,jj,jk) * (zw * zslpx(ji,jj,jk+1)) + ! zzwy = mydomain(ji,jj,jk ) + zind(ji,jj,jk) * (zw * zslpx(ji,jj,jk )) + ! + ! zwx(ji,jj,jk+1) = pwn(ji,jj,jk+1) * ( zalpha * zzwx + (1.-zalpha) * zzwy ) + ! END DO + ! END DO + !END DO + DO jk = 2, jpk + DO jj = 2, jpj - 1 + DO ji = 2, jpi - 1 + z0w = SIGN(0.5d0, pwn(ji, jj, jk)) + zalpha = 0.5d0 + z0w + !zw = z0w - 0.5d0*pwn(ji, jj, jk)*zdt*zbtr + zw = z0w - 0.5d0*pwn(ji, jj, jk)*1.0*1.0 + + zzwx = mydomain(ji, jj, jk) + zind(ji, jj, jk - 1)*(zw*zslpx(ji, jj, jk)) + zzwy = mydomain(ji, jj, jk - 1) + zind(ji, jj, jk - 1)*(zw*zslpx(ji, jj, jk - 1)) + + zwx(ji, jj, jk) = pwn(ji, jj, jk)*(zalpha*zzwx + (1.-zalpha)*zzwy) + END DO + END DO + END DO + + !zbtr = 1. 
+ DO jk = 1, jpk - 1 + DO jj = 2, jpj - 1 + DO ji = 2, jpi - 1 + !ztra = -zbtr*(zwx(ji, jj, jk) - zwx(ji, jj, jk + 1)) + ztra = -1.0*(zwx(ji, jj, jk) - zwx(ji, jj, jk + 1)) + mydomain(ji, jj, jk) = ztra + END DO + END DO + END DO + + end subroutine tra_adv_compute + +end module tra_adv_compute_mod diff --git a/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_driver_no_scalars.F90 b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_driver_no_scalars.F90 new file mode 100644 index 00000000..919ad031 --- /dev/null +++ b/benchmarks/nemo/tracer_advection/compute_in_subroutine/tra_adv_driver_no_scalars.F90 @@ -0,0 +1,150 @@ +program tracer_advection + USE dl_timer, only: timer_init, timer_register, timer_start, timer_stop, timer_report + use tra_adv_compute_mod, only: tra_adv_compute + implicit none + REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:) :: tsn + REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:) :: pun, pvn, pwn + REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:) :: mydomain, umask, vmask, tmask, zind + REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:,:) :: zslpx, zslpy, zwx, zwy + REAL*8, ALLOCATABLE, SAVE, DIMENSION(:,:) :: ztfreez, rnfmsk, upsmsk + REAL*8, ALLOCATABLE, SAVE, DIMENSION(:) :: rnfmsk_z + REAL*8 :: r, checksum + INTEGER :: jpi, jpj, jpk, ji, jj, jk, jt + INTEGER*8 :: itn_count + CHARACTER(len=10) :: env + !> Timer indexes, one for initialisation, one for the 'time-stepping' + INTEGER :: init_timer, step_timer + + CALL get_environment_variable("JPI", env) + READ ( env, '(i10)' ) jpi + CALL get_environment_variable("JPJ", env) + READ ( env, '(i10)' ) jpj + CALL get_environment_variable("JPK", env) + READ ( env, '(i10)' ) jpk + CALL get_environment_variable("IT", env) + READ ( env, '(i10)' ) itn_count + + ! Set-up our timers + + CALL timer_init() + CALL timer_register(init_timer, label='Initialisation') + ! We exclude the first step from the timed region. 
+ CALL timer_register(step_timer, label='Time-stepping', & + num_repeats=itn_count-1) + + ! Initialisation + + call timer_start(init_timer) + + ALLOCATE( mydomain (jpi,jpj,jpk), & + pun (jpi,jpj,jpk), & + pvn (jpi,jpj,jpk), & + pwn (jpi,jpj,jpk), & + umask (jpi,jpj,jpk), & + vmask (jpi,jpj,jpk), & + tmask (jpi,jpj,jpk), & + zind (jpi,jpj,jpk), & + ztfreez (jpi,jpj), & + rnfmsk (jpi,jpj), & + upsmsk (jpi,jpj), & + rnfmsk_z (jpk), & + tsn(jpi,jpj,jpk), & + zslpx(jpi,jpj,jpk), & + zslpy(jpi,jpj,jpk), & + zwx(jpi,jpj,jpk), & + zwy(jpi,jpj,jpk)) + + ! Array initialization + + r = jpi*jpj*jpk + + ! the following three lines can be uncommented to randomize arrays initialization + !call random_seed() + !call random_number(r) + !r = r*jpi*jpj*jpk + + DO jk = 1, jpk + DO jj = 1, jpj + DO ji = 1, jpi + umask(ji,jj,jk) = ji*jj*jk/r + mydomain(ji,jj,jk) =ji*jj*jk/r + pun(ji,jj,jk) =ji*jj*jk/r + pvn(ji,jj,jk) =ji*jj*jk/r + pwn(ji,jj,jk) =ji*jj*jk/r + vmask(ji,jj,jk)= ji*jj*jk/r + tsn(ji,jj,jk)= ji*jj*jk/r + tmask(ji,jj,jk)= ji*jj*jk/r + END DO + END DO + END DO + + r = jpi*jpj + DO jj=1, jpj + DO ji=1, jpi + ztfreez(ji,jj) = ji*jj/r + upsmsk(ji,jj) = ji*jj/r + rnfmsk(ji,jj) = ji*jj/r + END DO + END DO + + DO jk=1, jpk + rnfmsk_z(jk)=jk/jpk + END DO + + call timer_stop(init_timer) + + jt = 1 + call tra_adv_compute(zind, tsn, ztfreez, rnfmsk, rnfmsk_z, upsmsk, tmask, & + zwx, zwy, umask, vmask, mydomain, zslpx, zslpy, pun, pvn, pwn, & + jpi, jpj, jpk, jt) + + call timer_start(step_timer) + + do jt = 2, itn_count + call tra_adv_compute(zind, tsn, ztfreez, rnfmsk, rnfmsk_z, upsmsk, & + tmask, zwx, zwy, umask, vmask, mydomain, zslpx, zslpy, & + pun, pvn, pwn, jpi, jpj, jpk, jt) + end do + + call timer_stop(step_timer) + + ! 
Output final field and compute checksum + + open(unit = 24, file = 'output.dat', form='formatted') + + checksum = 0.0d0 + do jk = 1, jpk-1 + do jj = 2, jpj-1 + do ji = 2, jpi-1 + checksum = checksum + mydomain(ji,jj,jk) + write(24,*) mydomain(ji,jj,jk) + end do + end do + end do + + write(*, "('Checksum for domain ', 2(I4, ' x'), I4, ' (',I4,' iterations) = ',E23.16)") & + jpi, jpj, jpk, itn_count, checksum + + close(24) + + deallocate( mydomain ) + deallocate( pun ) + deallocate( pvn ) + deallocate( pwn ) + deallocate( umask) + deallocate( vmask) + deallocate( tmask) + deallocate( zind ) + deallocate( ztfreez ) + deallocate( rnfmsk) + deallocate( upsmsk) + deallocate( rnfmsk_z) + deallocate( tsn) + deallocate( zslpx) + deallocate( zslpy) + deallocate( zwx) + deallocate( zwy) + + call timer_report() + +end program tracer_advection diff --git a/benchmarks/nemo/tracer_advection/original/Makefile b/benchmarks/nemo/tracer_advection/original/Makefile index 7ef7fe85..811ed84e 100644 --- a/benchmarks/nemo/tracer_advection/original/Makefile +++ b/benchmarks/nemo/tracer_advection/original/Makefile @@ -59,13 +59,20 @@ tra_adv_serial: dl_timer ./tra_adv.F90 ${MAKE} FORT_FLAGS="${F90FLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${OMPFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ -tra_adv_acc_kernels: dl_timer ./tra_adv.F90 +tra_adv_acc_kernels_umem: dl_timer ./tra_adv.F90 mkdir -p $@ ${PSYCLONE} -s ../scripts/acc_kernels_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 cp Makefile_gen $@/Makefile ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ LDFLAGS="${LDFLAGS} ${ACCFLAGS} ${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ +tra_adv_acc_kernels: dl_timer ./tra_adv.F90 + mkdir -p $@ + ${PSYCLONE} -s ../scripts/acc_kernels_explicit_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 + cp Makefile_gen $@/Makefile + ${MAKE} FORT_FLAGS="${F90FLAGS} ${ACCFLAGS} ${UMEMFLAGS} -I../${DL_TIMER_DIR}/src" \ + LDFLAGS="${LDFLAGS} ${ACCFLAGS} 
${UMEMFLAGS} ../${DL_TIMER_DIR}/${DL_TIMER_NAME}" -C $@ + tra_adv_acc_loops: dl_timer ./tra_adv.F90 mkdir -p $@ ${PSYCLONE} -s ../scripts/acc_loops_trans.py -opsy $@/tra_adv.f90 ./tra_adv.F90 diff --git a/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_trans.py b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_trans.py new file mode 100644 index 00000000..9844c81a --- /dev/null +++ b/benchmarks/nemo/tracer_advection/scripts/acc_kernels_explicit_trans.py @@ -0,0 +1,500 @@ +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2018-2022, Science and Technology Facilities Council. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: R. W. Ford, A. R. Porter, N. Nobre and S. Siso, STFC Daresbury Lab + +'''A transformation script that seeks to apply OpenACC DATA and KERNELS +directives to NEMO style code. In order to use it you must first install +PSyclone. See README.md in the top-level directory. + +Once you have psyclone installed, this may be used by doing: + + $ psyclone -api nemo -s acc_kernels_explicit_trans.py some_source_file.f90 + +This should produce a lot of output, ending with generated +Fortran. Note that the Fortran source files provided to PSyclone must +have already been preprocessed (if required). + +The transformation script attempts to insert Kernels directives at the +highest possible location(s) in the schedule tree (i.e. to enclose as +much code as possible in each Kernels region). However, due to +limitations in the Nvidia compiler, we must take care to exclude certain +nodes (such as If blocks) from within Kernel regions. If a proposed +region is found to contain such a node (by the ``valid_acc_kernel`` +routine) then the script moves a level down the tree and then repeats +the process of attempting to create the largest possible Kernel region. + +Tested with the NVIDIA HPC SDK version 22.5.
+''' + +import logging +from psyclone.errors import InternalError +from psyclone.nemo import NemoInvokeSchedule, NemoKern, NemoLoop +from psyclone.psyGen import TransInfo +from psyclone.psyir.nodes import IfBlock, CodeBlock, Schedule, \ + ArrayReference, Assignment, BinaryOperation, Loop, \ + Literal, Return, Call, ACCDirective, ACCLoopDirective +from psyclone.psyir.transformations import TransformationError, ProfileTrans, \ + ACCUpdateTrans +from psyclone.transformations import ACCEnterDataTrans + +# Get the PSyclone transformations we will use +ACC_KERN_TRANS = TransInfo().get_trans_name('ACCKernelsTrans') +ACC_LOOP_TRANS = TransInfo().get_trans_name('ACCLoopTrans') +ACC_ROUTINE_TRANS = TransInfo().get_trans_name('ACCRoutineTrans') +ACC_EDATA_TRANS = ACCEnterDataTrans() +ACC_UPDATE_TRANS = ACCUpdateTrans() +PROFILE_TRANS = ProfileTrans() + +# Whether or not to add profiling calls around unaccelerated regions +PROFILE_NONACC = False + +# Whether or not to add OpenACC enter data and update directives to explicitly +# move data between host and device memory +ACC_DATA = True + +# If routine names contain these substrings then we do not profile them +PROFILING_IGNORE = ["_init", "_rst", "alloc", "agrif", "flo_dom", + "macho", "mpp_", "nemo_gcm", + # These are small functions that the addition of profiling + # prevents from being in-lined (and then breaks any attempt + # to create OpenACC regions with calls to them) + "interp1", "interp2", "interp3", "integ_spline", "sbc_dcy", + "sum", "sign_"] + +# Routines we do not attempt to add any OpenACC to (because it breaks with +# the Nvidia compiler or because it just isn't worth it) +ACC_IGNORE = ["day_mth", # Just calendar operations + "obs_surf_alloc", "oce_alloc", + # Compiler fails w/ "Unsupported local variable" + # Zero performance impact since outside execution path + "copy_obfbdata", "merge_obfbdata", + "turb_ncar", # Transforming hurts performance + "iom_open", "iom_get_123d", "iom_nf90_rp0123d", + "trc_bc_ini",
"p2z_ini", "p4z_ini"] # Str handling, init routine + +# Currently fparser has no way of distinguishing array accesses from +# function calls if the symbol is imported from some other module. +# We therefore work-around this by keeping a list of known NEMO +# functions that must be excluded from within KERNELS regions. +NEMO_FUNCTIONS = ["alpha_charn", "cd_neutral_10m", "cpl_freq", "cp_air", + "eos_pt_from_ct", "gamma_moist", "l_vap", + "sbc_dcy", "solfrac", "psi_h", "psi_m", "psi_m_coare", + "psi_h_coare", "psi_m_ecmwf", "psi_h_ecmwf", "q_sat", + "rho_air", "visc_air", "sbc_dcy", "glob_sum", + "glob_sum_full", "ptr_sj", "ptr_sjk", "interp1", "interp2", + "interp3", "integ_spline"] + + +class ExcludeSettings(): + ''' + Class to hold settings on what to exclude from OpenACC KERNELS regions. + + :param Optional[dict] settings: map of settings to override (or None). + + ''' + def __init__(self, settings=None): + # Whether we exclude IFs where the logical expression is not a + # comparison operation. None/empty settings select the default. + self.ifs_scalars = (settings or {}).get("ifs_scalars", False) + + +# Routines which are exceptions to the OpenACC Kernels regions exclusion rules. +EXCLUDING = {"default": ExcludeSettings(), + # Exclude for better GPU performance (requires further analysis). + "dyn_spg_ts": ExcludeSettings({"ifs_scalars": True}), + "tra_zdf_imp": ExcludeSettings({"ifs_scalars": True}), + # Exclude due to compiler bug preventing CPU multicore executions. + "dom_vvl_init": ExcludeSettings({"ifs_scalars": True})} + + +def log_msg(name, msg, node): + ''' + Log a message indicating why a transformation could not be performed. + + :param str name: the name of the routine. + :param str msg: the message to log. + :param node: the PSyIR node that prevented the transformation. + :type node: :py:class:`psyclone.psyir.nodes.Node` + + ''' + # Create a str representation of the position of the problematic node + # in the PSyIR tree.
+ node_strings = [] + parent = node + while parent: + node_strings.append(parent.node_str(colour=False)) + parent = parent.parent + node_strings.reverse() + location = "->".join(node_strings) + # Log the message + logging.info("%s: %s: %s", name, msg, location) + + +def valid_acc_kernel(node): + ''' + Whether the sub-tree that has `node` at its root is eligible to be + enclosed within an OpenACC KERNELS directive. + + :param node: the node in the PSyIR to check. + :type node: :py:class:`psyclone.psyir.nodes.Node` + + :returns: True if the sub-tree can be enclosed in a KERNELS region. + :rtype: bool + + ''' + # The Fortran routine which our parent Invoke represents + routine_name = node.ancestor(NemoInvokeSchedule).invoke.name + + # Allow for per-routine setting of what to exclude from within KERNELS + # regions. This is because sometimes things work in one context but not + # in another (with the Nvidia compiler). EXCLUDING keys are lower case. + excluding = EXCLUDING.get(routine_name.lower(), EXCLUDING["default"]) + + # Rather than walk the tree multiple times, look for both excluded node + # types and possibly problematic operations + excluded_node_types = (CodeBlock, Return, Call, IfBlock, NemoLoop) + excluded_nodes = node.walk(excluded_node_types) + + for enode in excluded_nodes: + if isinstance(enode, (CodeBlock, Return, Call)): + log_msg(routine_name, + f"region contains {type(enode).__name__}", enode) + return False + + if isinstance(enode, IfBlock): + # We permit IF blocks originating from WHERE constructs and + # single-statement IF blocks containing a Loop in KERNELS regions + if "was_where" in enode.annotations or \ + "was_single_stmt" in enode.annotations and enode.walk(Loop): + continue + + arrays = enode.condition.walk(ArrayReference) + # We exclude if statements where the condition expression does + # not refer to arrays at all as this may cause compiler issues + # (get "Missing branch target block") or produce faster code.
+ if not arrays and excluding.ifs_scalars and \ + not isinstance(enode.condition, BinaryOperation): + log_msg(routine_name, "IF references scalars", enode) + return False + # When using CUDA Unified Memory, only allocated arrays reside in + # shared memory (including those that are created by compiler- + # -generated allocs, e.g. for automatic arrays). We assume that all + # arrays of rank 2 or greater are dynamically allocated, whereas 1D + # arrays are often static in NEMO. Hence, we disallow IFs where the + # logical expression involves the latter. + if any(len(array.children) == 1 for array in arrays): + log_msg(routine_name, + "IF references 1D arrays that may be static", enode) + return False + + elif isinstance(enode, NemoLoop): + # Heuristic: + # We don't want to put loops around 3D loops into KERNELS regions + # and nor do we want to put loops over levels into KERNELS regions + # if they themselves contain several 2D loops. + # In general, this heuristic will depend upon how many levels the + # model configuration will contain. + child = enode.loop_body[0] + if isinstance(child, Loop) and child.loop_type == "levels": + # We have a loop around a loop over levels + log_msg(routine_name, "Loop is around a loop over levels", + enode) + return False + if enode.loop_type == "levels" and \ + len(enode.loop_body.children) > 1: + # The body of the loop contains more than one statement. + # How many distinct loop nests are there? + loop_count = 0 + for child in enode.loop_body.children: + if child.walk(Loop): + loop_count += 1 + if loop_count > 1: + log_msg(routine_name, + "Loop over levels contains several " + "other loops", enode) + return False + + # For now we don't support putting *just* the implicit loop assignment in + # things like: + # if(do_this)my_array(:,:) = 1.0 + # inside a kernels region. Once we generate Fortran instead of modifying + # the fparser2 parse tree this will become possible. 
+ if isinstance(node.parent, Schedule) and \ + isinstance(node.parent.parent, IfBlock) and \ + "was_single_stmt" in node.parent.parent.annotations: + log_msg(routine_name, "Would split single-line If statement", node) + return False + + # Finally, check that we haven't got any 'array accesses' that are in + # fact function calls. + refs = node.walk(ArrayReference) + # Since kernels are leaves in the PSyIR, we need to separately check + # their schedules for array references too. + kernels = node.walk(NemoKern) + for kern in kernels: + sched = kern.get_kernel_schedule() + refs += sched.walk(ArrayReference) + for ref in refs: + # Check if this reference has the name of a known function and if that + # reference appears outside said known function. + if ref.name.lower() in NEMO_FUNCTIONS and \ + ref.name.lower() != routine_name.lower(): + log_msg(routine_name, + f"Loop contains function call: {ref.name}", ref) + return False + return True + + +def add_kernels(children): + ''' + Walks through the PSyIR inserting OpenACC KERNELS directives at as + high a level as possible. + + :param children: list of sibling Nodes in PSyIR that are candidates for \ + inclusion in an ACC KERNELS region. + :type children: list of :py:class:`psyclone.psyir.nodes.Node` + + :returns: True if any KERNELS regions are successfully added. + :rtype: bool + + ''' + added_kernels = False + if not children: + return added_kernels + + node_list = [] + for child in children[:]: + # Can this node be included in a kernels region? + if not valid_acc_kernel(child): + # It can't so we put what we have so far inside a kernels region + success = try_kernels_trans(node_list) + added_kernels |= success + # A node that cannot be included in a kernels region marks the + # end of the current candidate region so reset the list. 
+ node_list = [] + # Now we go down a level and try again + if isinstance(child, IfBlock): + success1 = add_kernels(child.if_body) + success2 = add_kernels(child.else_body) + success = success1 or success2 + elif isinstance(child, Loop): + success = add_kernels(child.loop_body) + else: + success = add_kernels(child.children) + added_kernels |= success + else: + # We can add this node to our list for the current region + node_list.append(child) + success = try_kernels_trans(node_list) + added_kernels |= success + + return added_kernels + + +def add_profiling(children): + ''' + Walks down the PSyIR and inserts the largest possible profiling regions. + Code that contains OpenACC directives is excluded. + + :param children: sibling nodes in the PSyIR to which to attempt to add \ + profiling regions. + :type children: list of :py:class:`psyclone.psyir.nodes.Node` + + ''' + if not children: + return + + node_list = [] + for child in children[:]: + # Do we want this node to be included in a profiling region? + if child.walk((ACCDirective, Return)): + # It contains OpenACC so we put what we have so far inside a + # profiling region + add_profile_region(node_list) + # A node that is not included in a profiling region marks the + # end of the current candidate region so reset the list. + node_list = [] + # Now we go down a level and try again without attempting to put + # profiling below OpenACC directives or within Assignments + if isinstance(child, IfBlock): + add_profiling(child.if_body) + add_profiling(child.else_body) + elif not isinstance(child, (Assignment, ACCDirective)): + add_profiling(child.children) + else: + # We can add this node to our list for the current region + node_list.append(child) + add_profile_region(node_list) + + +def add_profile_region(nodes): + ''' + Attempt to put the supplied list of nodes within a profiling region. + + :param nodes: list of sibling PSyIR nodes to enclose. 
+ :type nodes: list of :py:class:`psyclone.psyir.nodes.Node` + + ''' + if nodes: + # Check whether we should be adding profiling inside this routine + routine_name = \ + nodes[0].ancestor(NemoInvokeSchedule).invoke.name.lower() + if any(ignore in routine_name for ignore in PROFILING_IGNORE): + return + if len(nodes) == 1: + if isinstance(nodes[0], CodeBlock) and \ + len(nodes[0].get_ast_nodes) == 1: + # Don't create profiling regions for CodeBlocks consisting + # of a single statement + return + if isinstance(nodes[0], IfBlock) and \ + "was_single_stmt" in nodes[0].annotations and \ + isinstance(nodes[0].if_body[0], CodeBlock): + # We also don't put single statements consisting of + # 'IF(condition) CALL blah()' inside profiling regions + return + try: + PROFILE_TRANS.apply(nodes) + except TransformationError as err: + logging.info("%s: profiling not added: %s", routine_name, err) + + +def try_kernels_trans(nodes): + ''' + Attempt to enclose the supplied list of nodes within a kernels + region. If the transformation fails then the error message is + reported but execution continues. + + :param nodes: list of Nodes to enclose within a Kernels region. + :type nodes: list of :py:class:`psyclone.psyir.nodes.Node` + + :returns: True if the transformation was successful, False otherwise. + :rtype: bool + + ''' + # We only enclose the proposed region if it contains a loop. + have_loop = False + for node in nodes: + if node.walk(Loop): + have_loop = True + break + assigns = node.walk(Assignment) + for assign in assigns: + if assign.is_array_range: + have_loop = True + break + if not have_loop: + return False + + try: + ACC_KERN_TRANS.apply(nodes, {"default_present": False}) + + # Put COLLAPSE on any tightly-nested loops over latitude and longitude. + for node in nodes: + loops = node.walk(Loop) + for loop in loops: + if loop.ancestor(ACCLoopDirective): + # We've already transformed a parent Loop so skip this one.
+ continue + # We put a COLLAPSE(2) clause on any perfectly-nested lat-lon + # loops that have a Literal value for their step. The latter + # condition is necessary to avoid compiler errors. + if loop.loop_type == "lat" and \ + isinstance(loop.step_expr, Literal) and \ + isinstance(loop.loop_body[0], Loop) and \ + loop.loop_body[0].loop_type == "lon" and \ + isinstance(loop.loop_body[0].step_expr, Literal) and \ + len(loop.loop_body.children) == 1: + try: + ACC_LOOP_TRANS.apply(loop, {"collapse": 2}) + except (TransformationError) as err: + print(f"Failed to collapse lat-lon loop: {loop}") + print(f"Error was: {err}") + + return True + except (TransformationError, InternalError) as err: + print(f"Failed to insert acc kernels around nodes: {nodes}") + print(f"Error was: {err}") + return False + + +def trans(psy): + '''A PSyclone-script compliant transformation function. Applies + OpenACC 'kernels' directives to NEMO code. Data movement can be + handled manually or through CUDA's managed-memory functionality. + + :param psy: The PSy layer object to apply transformations to. + :type psy: :py:class:`psyclone.psyGen.PSy` + + ''' + logging.basicConfig(filename='psyclone.log', filemode='w', + level=logging.INFO) + + invoke_list = "\n".join([str(name) for name in psy.invokes.names]) + print(f"Invokes found:\n{invoke_list}\n") + + for invoke in psy.invokes.invoke_list: + + sched = invoke.schedule + if not sched: + print(f"Invoke {invoke.name} has no Schedule! 
Skipping...") + continue + + # In the lib_fortran file we annotate each routine that does not + # have a Loop or a Call with the OpenACC Routine Directive + if psy.name == "psy_lib_fortran_psy" and not sched.walk((Loop, Call)): + print(f"Transforming {invoke.name} with acc routine") + ACC_ROUTINE_TRANS.apply(sched) + continue + + # Attempt to add OpenACC directives unless we are ignoring this routine + if invoke.name.lower() not in ACC_IGNORE: + print(f"Transforming {invoke.name} with acc kernels") + have_kernels = add_kernels(sched.children) + if have_kernels and ACC_DATA: + print(f"Transforming {invoke.name} with acc enter data") + ACC_EDATA_TRANS.apply(sched) + else: + print(f"Addition of OpenACC to routine {invoke.name} disabled!") + + if ACC_DATA: + print(f"Transforming {invoke.name} with acc update") + ACC_UPDATE_TRANS.apply(sched) + + # Add profiling instrumentation + if PROFILE_NONACC: + print(f"Adding profiling to non-OpenACC regions in {invoke.name}") + add_profiling(sched.children) + + return psy diff --git a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py index c1513957..b506880e 100755 --- a/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py +++ b/benchmarks/nemo/tracer_advection/scripts/omp_gpu_trans.py @@ -36,8 +36,7 @@ ''' PSyclone transformation script to insert OpenMP Target Loop directives to the outermost loop that is parallelisable, including implicit loops. 
''' -from psyclone.psyir.transformations import OMPTargetTrans -from psyclone.transformations import OMPLoopTrans +from psyclone.psyir.transformations import OMPTargetTrans, OMPLoopTrans from utils import insert_explicit_loop_parallelism, normalise_loops @@ -54,9 +53,9 @@ def trans(psy): ''' omp_target_trans = OMPTargetTrans() omp_loop_trans = OMPLoopTrans() - # Disabling worksharing will produce the 'loop' directive which is better + # Use the 'loop' directive which is better # suited to map the work into the GPU - omp_loop_trans.omp_worksharing = False + omp_loop_trans.omp_directive = "loop" print("Invokes found:") for invoke in psy.invokes.invoke_list: diff --git a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh index 09441c3e..85ca589f 100755 --- a/benchmarks/nemo/tracer_advection/scripts/problemsize.sh +++ b/benchmarks/nemo/tracer_advection/scripts/problemsize.sh @@ -16,15 +16,26 @@ export IT=500 # Number of vertical levels export JPK=75 +export JPI=128 +export JPJ=128 +taskset -c 2 $@ > /dev/null 2>&1 + base=2 #for power in $(seq 4 12); do -for power in $(seq 4 9); do +for power in $(seq 6 10); do size=$(echo "$base^$power" | bc) export JPI=${size} export JPJ=${size} + if (( $power < 6 )); + then + # Do a warm-up run if it's a small problem size + taskset -c 2 $@ > /dev/null 2>&1 + fi time=$(taskset -c 2 $@ | awk '{if ($1 == "Time-stepping") {print $5} }') echo $size $time done + +rm -f output.dat diff --git a/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_trans.py b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_trans.py new file mode 100644 index 00000000..23d28eaa --- /dev/null +++ b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_trans.py @@ -0,0 +1,76 @@ +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2022, Science and Technology Facilities Council. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: R. W. Ford and A. R. Porter, STFC Daresbury Lab. + +'''Module providing a PSyclone transformation script that first converts +the supplied PSyIR into a form compatible with the Stencil Intermediate +Representation (SIR) and then adds OpenACC Kernels regions to it. 
+ +''' + +from acc_kernels_trans import add_kernels +from sir_loop_trans import make_sir_compliant +from psyclone.psyir.transformations import ACCUpdateTrans +from psyclone.transformations import ACCEnterDataTrans + + +UPDATE_TRANS = ACCUpdateTrans() +EDATA_TRANS = ACCEnterDataTrans() + + +def trans(psy): + ''' + Transformation routine for use with PSyclone. It calls + :py:func:`sir_loop_trans.make_sir_compliant` and then + :py:func:`acc_kernels_trans.add_kernels` for each schedule in each invoke. + + :param psy: the PSy object which this script will transform. + :type psy: :py:class:`psyclone.psyGen.PSy` + + :returns: the transformed PSy object. + :rtype: :py:class:`psyclone.psyGen.PSy` + + ''' + for invoke in psy.invokes.invoke_list: + + sched = invoke.schedule + if not sched: + print(f"Invoke {invoke.name} has no Schedule! Skipping...") + continue + + make_sir_compliant(sched) + add_kernels(sched.children) + EDATA_TRANS.apply(sched) + UPDATE_TRANS.apply(sched) + print(sched.view()) diff --git a/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_um_trans.py b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_um_trans.py new file mode 100644 index 00000000..390ae05d --- /dev/null +++ b/benchmarks/nemo/tracer_advection/scripts/sir_loop_kernels_um_trans.py @@ -0,0 +1,68 @@ +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2022, Science and Technology Facilities Council. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer.
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Authors: R. W. Ford and A. R. Porter, STFC Daresbury Lab. + +'''Module providing a PSyclone transformation script that first converts +the supplied PSyIR into a form compatible with the Stencil Intermediate +Representation (SIR) and then adds OpenACC Kernels regions to it. + +''' + +from acc_kernels_trans import add_kernels +from sir_loop_trans import make_sir_compliant + + +def trans(psy): + ''' + Transformation routine for use with PSyclone. It calls + :py:func:`sir_loop_trans.make_sir_compliant` and then + :py:func:`acc_kernels_trans.add_kernels` for each schedule in each invoke. + + :param psy: the PSy object which this script will transform.
+ :type psy: :py:class:`psyclone.psyGen.PSy` + + :returns: the transformed PSy object. + :rtype: :py:class:`psyclone.psyGen.PSy` + + ''' + for invoke in psy.invokes.invoke_list: + + sched = invoke.schedule + if not sched: + print(f"Invoke {invoke.name} has no Schedule! Skipping...") + continue + + make_sir_compliant(sched) + add_kernels(sched.children) + print(sched.view()) diff --git a/benchmarks/nemo/tracer_advection/scripts/sir_loop_trans.py b/benchmarks/nemo/tracer_advection/scripts/sir_loop_trans.py new file mode 100644 index 00000000..016a41f8 --- /dev/null +++ b/benchmarks/nemo/tracer_advection/scripts/sir_loop_trans.py @@ -0,0 +1,113 @@ +# ----------------------------------------------------------------------------- +# BSD 3-Clause License +# +# Copyright (c) 2022, Science and Technology Facilities Council +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# ----------------------------------------------------------------------------- +# Author: R. W. Ford, STFC Daresbury Lab + +'''Module providing a transformation script that converts the supplied +PSyIR to the Stencil intermediate representation (SIR). Translation to +the SIR is limited to the NEMO API. The NEMO API has no algorithm +layer so all of the original code is captured in the invoke +objects. Therefore by translating all of the invoke objects, all of +the original code is translated. + +''' + +from psyclone.psyir.nodes import Assignment +from psyclone.psyir.transformations import HoistTrans +from psyclone.domain.nemo.transformations import NemoAllArrayRange2LoopTrans, \ + NemoAllArrayAccess2LoopTrans + + +def trans(psy): + '''Transformation routine for use with PSyclone. Applies the + NemoAllArrayRange2LoopTrans, NemoAllArrayAccess2LoopTrans and + HoistTrans transformations to the supplied invokes. This + transformation routine is limited to the NEMO API. + + :param psy: the PSy object which this script will transform. + :type psy: :py:class:`psyclone.psyGen.PSy` + :returns: the transformed PSy object. + :rtype: :py:class:`psyclone.psyGen.PSy` + + ''' + + # For each Invoke transform the schedule so that it is compatible + # with SIR generation. Note, there is no algorithm layer in the NEMO API + # so the invokes represent all of the original code. 
+ for invoke in psy.invokes.invoke_list: + schedule = invoke.schedule + + make_sir_compliant(schedule) + + return psy + + +def make_sir_compliant(schedule): + ''' + Applies various transformations to the supplied schedule to replace any + features that cannot be represented in SIR with alternative forms: + + 1. Converts any accesses of individual array elements into 1-trip loops. + 2. Transforms array assignments into loops. + 3. Hoists any loop-invariant assignments out of loops over levels. + + :param schedule: the schedule to transform. + :type schedule: :py:class:`psyclone.psyir.nodes.Schedule` + + ''' + array_range_trans = NemoAllArrayRange2LoopTrans() + array_access_trans = NemoAllArrayAccess2LoopTrans() + hoist_trans = HoistTrans() + + # Transform any single index accesses in array assignments + # (e.g. a(1)) into 1-trip loops. + for assignment in schedule.walk(Assignment): + array_access_trans.apply(assignment) + + # Transform any array assignments (Fortran ':' notation) into loops. + for assignment in schedule.walk(Assignment): + array_range_trans.apply(assignment) + + # Remove any loop invariant assignments inside k-loops to make + # them perfectly nested. At the moment this transformation + # does not perform any dependence analysis validation so could + # move code that should not be moved, see issue + # #1387. However, it is known that it is safe to apply this + # transformation to this particular code + # (tra_adv_compute.F90). + for loop in schedule.loops(): + # outermost only + if loop.loop_type == "levels": + for child in loop.loop_body[:]: + if isinstance(child, Assignment): + hoist_trans.apply(child)