From e0e739b39b8f8600eb72112c32f4b33a7bd7bd24 Mon Sep 17 00:00:00 2001 From: cflag Date: Mon, 28 Feb 2022 10:20:32 +0100 Subject: [PATCH 01/10] Try to increase cache usage in derive / interp operators --- src/thomas.f90 | 29 +- src/x3d_derive.f90 | 1118 ++++++++++++++++++----------------------- src/x3d_staggered.f90 | 1066 ++++++++++++++++++--------------------- 3 files changed, 1017 insertions(+), 1196 deletions(-) diff --git a/src/thomas.f90 b/src/thomas.f90 index ebc18d4..e6365ff 100644 --- a/src/thomas.f90 +++ b/src/thomas.f90 @@ -27,6 +27,11 @@ module thomas module procedure zthomas_12 end interface zthomas + interface thomas1d + module procedure thomas1d_0 + module procedure thomas1d_12 + end interface thomas1d + contains @@ -181,7 +186,7 @@ end subroutine zthomas_12 ! fw, in, used during the backward step ! nn, in, size of the vector ! - pure subroutine thomas1d(tt, ff, fs, fw, nn) + pure subroutine thomas1d_12(tt, ff, fs, fw, nn) implicit none @@ -199,6 +204,26 @@ pure subroutine thomas1d(tt, ff, fs, fw, nn) tt(k) = (tt(k)-ff(k)*tt(k+1)) * fw(k) enddo - end subroutine thomas1d + end subroutine thomas1d_12 + + pure subroutine thomas1d_0(tt, ff, fs, fw, perio, alfa, nn) + + implicit none + + integer, intent(in) :: nn + real(mytype), intent(inout), dimension(nn) :: tt + real(mytype), intent(in), dimension(nn) :: ff, fs, fw, perio + real(mytype), intent(in) :: alfa + + integer :: k + real(mytype) :: ss + + call thomas1d_12(tt, ff, fs, fw, nn) + ss = (tt(1)-alfa*tt(nn)) / (one + perio(1) - alfa*tt(nn)) + do k = 1, nn + tt(k) = tt(k) - ss*perio(nn) + enddo + + end subroutine thomas1d_0 end module thomas diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 0b5c158..7e131d1 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -141,27 +141,30 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = afix*(ux(2,j,k)-ux(nx,j,k)) & + buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & + bfix*(ux(3,j,k)-ux(nx-1,j,k)) - tx(2,j,k) = afix*(ux(3,j,k)-ux(1,j,k)) & + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + bfix*(ux(4,j,k)-ux(nx,j,k)) do concurrent (i=3:nx-2) - tx(i,j,k) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) enddo - tx(nx-1,j,k) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + bfix*(ux(1,j,k)-ux(nx-3,j,k)) - tx(nx,j,k) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + buffer(nx) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + bfix*(ux(2,j,k)-ux(nx-2,j,k)) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent(i=1:nx) + tx(i,j,k) = buffer(i) + enddo enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - end subroutine derx_00 !******************************************************************** @@ -180,48 +183,52 @@ subroutine derx_ij(tx,ux,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - tx(1,j,k) = zero - tx(2,j,k) = afix*(ux(3,j,k)-ux(1,j,k)) & + buffer(1) = zero + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + bfix*(ux(4,j,k)-ux(2,j,k)) else - tx(1,j,k) = afix*(ux(2,j,k)+ux(2,j,k)) & + buffer(1) = afix*(ux(2,j,k)+ux(2,j,k)) & + bfix*(ux(3,j,k)+ux(3,j,k)) - tx(2,j,k) = afix*(ux(3,j,k)-ux(1,j,k)) & + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + bfix*(ux(4,j,k)+ux(2,j,k)) endif else - tx(1,j,k) = af1x*ux(1,j,k) + bf1x*ux(2,j,k) + cf1x*ux(3,j,k) - tx(2,j,k) = af2x*(ux(3,j,k)-ux(1,j,k)) + buffer(1) = af1x*ux(1,j,k) + bf1x*ux(2,j,k) + cf1x*ux(3,j,k) + buffer(2) = af2x*(ux(3,j,k)-ux(1,j,k)) endif do concurrent (i=3:nx-2) - tx(i,j,k) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) enddo ! nx-1 <= i <= nx if (ncln==1) then if (npaire==1) then - tx(nx-1,j,k) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + bfix*(ux(nx-1,j,k)-ux(nx-3,j,k)) - tx(nx,j,k) = zero + buffer(nx) = zero else - tx(nx-1,j,k) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + bfix*((-ux(nx-1,j,k))-ux(nx-3,j,k)) - tx(nx,j,k) = afix*((-ux(nx-1,j,k))-ux(nx-1,j,k)) & + buffer(nx) = afix*((-ux(nx-1,j,k))-ux(nx-1,j,k)) & + bfix*((-ux(nx-2,j,k))-ux(nx-2,j,k)) endif else - tx(nx-1,j,k) = afmx*(ux(nx,j,k)-ux(nx-2,j,k)) - tx(nx,j,k) = - afnx*ux(nx,j,k) - bfnx*ux(nx-1,j,k) - cfnx*ux(nx-2,j,k) + buffer(nx-1) = afmx*(ux(nx,j,k)-ux(nx-2,j,k)) + buffer(nx) = - afnx*ux(nx,j,k) - bfnx*ux(nx-1,j,k) - cfnx*ux(nx-2,j,k) endif - enddo - ! Solve tri-diagonal system - call xthomas(tx, ff, fs, fw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, fs, fw, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo end subroutine derx_ij @@ -304,40 +311,37 @@ subroutine dery_00(ty,uy,x3dop,ppy,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer - ! Compute r.h.s. - do concurrent (k=1:nz) - do concurrent (i=1:nx) - ty(i,1,k) = afjy*(uy(i,2,k)-uy(i,ny,k)) & - + bfjy*(uy(i,3,k)-uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = afjy*(uy(i,3,k)-uy(i,1,k)) & - + bfjy*(uy(i,4,k)-uy(i,ny,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + do concurrent (k=1:nz, i=1:nx) + ! Compute r.h.s. + buffer(1) = afjy*(uy(i,2,k)-uy(i,ny,k)) & + + bfjy*(uy(i,3,k)-uy(i,ny-1,k)) + buffer(2) = afjy*(uy(i,3,k)-uy(i,1,k)) & + + bfjy*(uy(i,4,k)-uy(i,ny,k)) + do concurrent (j=3:ny-2) + buffer(j) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + bfjy*(uy(i,j+2,k)-uy(i,j-2,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & - + bfjy*(uy(i,1,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = afjy*(uy(i,1,k)-uy(i,ny-1,k)) & - + bfjy*(uy(i,2,k)-uy(i,ny-2,k)) - enddo - enddo - - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + buffer(ny-1) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & + + bfjy*(uy(i,1,k)-uy(i,ny-3,k)) + buffer(ny) = afjy*(uy(i,1,k)-uy(i,ny-1,k)) & + + bfjy*(uy(i,2,k)-uy(i,ny-2,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + + ! Apply stretching if needed + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif - ! Apply stretching if needed - if (istret /= 0) then - do concurrent (k=1:nz, j=1:ny, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppy(j) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo - endif + enddo end subroutine dery_00 @@ -357,79 +361,60 @@ subroutine dery_ij(ty,uy,ff,fs,fw,ppy,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,1,k) = zero - enddo - do concurrent (i=1:nx) - ty(i,2,k) = afjy*(uy(i,3,k)-uy(i,1,k)) & - + bfjy*(uy(i,4,k)-uy(i,2,k)) - enddo + buffer(1) = zero + buffer(2) = afjy*(uy(i,3,k)-uy(i,1,k)) & + + bfjy*(uy(i,4,k)-uy(i,2,k)) else - do concurrent (i=1:nx) - ty(i,1,k) = afjy*(uy(i,2,k)+uy(i,2,k)) & - + bfjy*(uy(i,3,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = afjy*(uy(i,3,k)-uy(i,1,k)) & - + bfjy*(uy(i,4,k)+uy(i,2,k)) - enddo + buffer(1) = afjy*(uy(i,2,k)+uy(i,2,k)) & + + bfjy*(uy(i,3,k)+uy(i,3,k)) + buffer(2) = afjy*(uy(i,3,k)-uy(i,1,k)) & + + bfjy*(uy(i,4,k)+uy(i,2,k)) endif else - do concurrent (i=1:nx) - ty(i,1,k) = af1y*uy(i,1,k)+bf1y*uy(i,2,k)+cf1y*uy(i,3,k) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = af2y*(uy(i,3,k)-uy(i,1,k)) - enddo + buffer(1) = af1y*uy(i,1,k)+bf1y*uy(i,2,k)+cf1y*uy(i,3,k) + buffer(2) = af2y*(uy(i,3,k)-uy(i,1,k)) endif - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + do concurrent (j=3:ny-2) + buffer(j) = afjy*(uy(i,j+1,k)-uy(i,j-1,k)) & + bfjy*(uy(i,j+2,k)-uy(i,j-2,k)) enddo if (ncln==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,ny-1,k) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & - + bfjy*(uy(i,ny-1,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = zero - enddo + buffer(ny-1) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & + + bfjy*(uy(i,ny-1,k)-uy(i,ny-3,k)) + buffer(ny) = zero else - do concurrent (i=1:nx) - ty(i,ny-1,k) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & - + bfjy*((-uy(i,ny-1,k))-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = afjy*((-uy(i,ny-1,k))-uy(i,ny-1,k)) & - + bfjy*((-uy(i,ny-2,k))-uy(i,ny-2,k)) - enddo + buffer(ny-1) = afjy*(uy(i,ny,k)-uy(i,ny-2,k)) & + + bfjy*((-uy(i,ny-1,k))-uy(i,ny-3,k)) + buffer(ny) = afjy*((-uy(i,ny-1,k))-uy(i,ny-1,k)) & + + bfjy*((-uy(i,ny-2,k))-uy(i,ny-2,k)) endif else - do concurrent (i=1:nx) - ty(i,ny-1,k) = afmy*(uy(i,ny,k)-uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = -afny*uy(i,ny,k)-bfny*uy(i,ny-1,k)-cfny*uy(i,ny-2,k) - enddo + buffer(ny-1) = afmy*(uy(i,ny,k)-uy(i,ny-2,k)) + buffer(ny) = -afny*uy(i,ny,k)-bfny*uy(i,ny-1,k)-cfny*uy(i,ny-2,k) endif - enddo - ! Solve tri-diagonal system - call ythomas(ty, ff, fs, fw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, fs, fw, ny) + + ! Apply stretching if needed + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif - ! Apply stretching if needed - if (istret /= 0) then - do concurrent (k=1:nz, j=1:ny, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppy(j) + do concurrent(j=1:ny) + ty(i,j,k) = buffer(j) enddo - endif + enddo end subroutine dery_ij @@ -513,6 +498,7 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -521,30 +507,27 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) return endif - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = afkz*(uz(i,j,2)-uz(i,j,nz )) & + ! Compute r.h.s. + buffer(1) = afkz*(uz(i,j,2)-uz(i,j,nz )) & + bfkz*(uz(i,j,3)-uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = afkz*(uz(i,j,3)-uz(i,j,1 )) & + buffer(2) = afkz*(uz(i,j,3)-uz(i,j,1 )) & + bfkz*(uz(i,j,4)-uz(i,j,nz)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & - + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afkz*(uz(i,j,nz)-uz(i,j,nz-2)) & + do concurrent (k=3:nz-2) + buffer(k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & + + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) + enddo + buffer(nz-1) = afkz*(uz(i,j,nz)-uz(i,j,nz-2)) & + bfkz*(uz(i,j,1 )-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = afkz*(uz(i,j,1)-uz(i,j,nz-1)) & + buffer(nz) = afkz*(uz(i,j,1)-uz(i,j,nz-1)) & + bfkz*(uz(i,j,2)-uz(i,j,nz-2)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derz_00 @@ -564,6 +547,7 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -572,70 +556,51 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) return endif - ! Compute r.h.s. - if (ncl1==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = zero - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + do concurrent (j=1:ny, i=1:nx) + ! Compute r.h.s. + if (ncl1==1) then + if (npaire==1) then + buffer(1) = zero + buffer(2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + bfkz*(uz(i,j,4)-uz(i,j,2)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = afkz*(uz(i,j,2)+uz(i,j,2)) & + else + buffer(1) = afkz*(uz(i,j,2)+uz(i,j,2)) & + bfkz*(uz(i,j,3)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + buffer(2) = afkz*(uz(i,j,3)-uz(i,j,1)) & + bfkz*(uz(i,j,4)+uz(i,j,2)) - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = af1z*uz(i,j,1) + bf1z*uz(i,j,2) & + endif + else + buffer(1) = af1z*uz(i,j,1) + bf1z*uz(i,j,2) & + cf1z*uz(i,j,3) + buffer(2) = af2z*(uz(i,j,3)-uz(i,j,1)) + endif + do concurrent (k=3:nz-2) + buffer(k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & + + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = af2z*(uz(i,j,3)-uz(i,j,1)) - enddo - endif - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = afkz*(uz(i,j,k+1)-uz(i,j,k-1)) & - + bfkz*(uz(i,j,k+2)-uz(i,j,k-2)) - enddo - if (ncln==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afkz*(uz(i,j,nz )-uz(i,j,nz-2)) & + if (ncln==1) then + if (npaire==1) then + buffer(nz-1) = afkz*(uz(i,j,nz )-uz(i,j,nz-2)) & + bfkz*(uz(i,j,nz-1)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = zero - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afkz*( uz(i,j,nz )-uz(i,j,nz-2)) & + buffer(nz) = zero + else + buffer(nz-1) = afkz*( uz(i,j,nz )-uz(i,j,nz-2)) & + bfkz*(-uz(i,j,nz-1)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = afkz*(-uz(i,j,nz-1)-uz(i,j,nz-1)) & + buffer(nz) = afkz*(-uz(i,j,nz-1)-uz(i,j,nz-1)) & + bfkz*(-uz(i,j,nz-2)-uz(i,j,nz-2)) - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = afmz*(uz(i,j,nz)-uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = - afnz*uz(i,j,nz) - bfnz*uz(i,j,nz-1) & + endif + else + buffer(nz-1) = afmz*(uz(i,j,nz)-uz(i,j,nz-2)) + buffer(nz) = - afnz*uz(i,j,nz) - bfnz*uz(i,j,nz-1) & - cfnz*uz(i,j,nz-2) - enddo - endif + endif - ! Solve tri-diagonal system - call zthomas(tz, ff, fs, fw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, ff, fs, fw, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derz_ij @@ -715,10 +680,11 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer ! Compute r.h.s. do concurrent (k=1:nz, j=1:ny) - tx(1,j,k) = asix*(ux(2,j,k)-ux(1 ,j,k) & + buffer(1) = asix*(ux(2,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx ,j,k)) & + bsix*(ux(3,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx-1,j,k)) & @@ -726,7 +692,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(1,j,k)+ux(nx-2,j,k)) & + dsix*(ux(5,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx-3,j,k)) - tx(2,j,k) = asix*(ux(3,j,k)-ux(2 ,j,k) & + buffer(2) = asix*(ux(3,j,k)-ux(2 ,j,k) & -ux(2,j,k)+ux(1 ,j,k)) & + bsix*(ux(4,j,k)-ux(2 ,j,k) & -ux(2,j,k)+ux(nx ,j,k)) & @@ -734,7 +700,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(2,j,k)+ux(nx-1,j,k)) & + dsix*(ux(6,j,k)-ux(2 ,j,k) & -ux(2,j,k)+ux(nx-2,j,k)) - tx(3,j,k) = asix*(ux(4,j,k)-ux(3 ,j,k) & + buffer(3) = asix*(ux(4,j,k)-ux(3 ,j,k) & -ux(3,j,k)+ux(2 ,j,k)) & + bsix*(ux(5,j,k)-ux(3 ,j,k) & -ux(3,j,k)+ux(1 ,j,k)) & @@ -742,7 +708,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(3,j,k)+ux(nx,j,k)) & + dsix*(ux(7,j,k)-ux(3 ,j,k) & -ux(3,j,k)+ux(nx-1,j,k)) - tx(4,j,k) = asix*(ux(5,j,k)-ux(4 ,j,k) & + buffer(4) = asix*(ux(5,j,k)-ux(4 ,j,k) & -ux(4,j,k)+ux(3 ,j,k)) & + bsix*(ux(6,j,k)-ux(4 ,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -751,7 +717,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) + dsix*(ux(8,j,k)-ux(4 ,j,k) & -ux(4,j,k)+ux(nx,j,k)) do concurrent (i=5:nx-4) - tx(i,j,k) = asix*(ux(i+1,j,k)-ux(i ,j,k) & + buffer(i) = asix*(ux(i+1,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-1,j,k)) & + bsix*(ux(i+2,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-2,j,k)) & @@ -760,7 +726,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) + dsix*(ux(i+4,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-4,j,k)) enddo - tx(nx-3,j,k) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsix*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & @@ -768,7 +734,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx-3,j,k)+ux(nx-6,j,k)) & + dsix*(ux(1 ,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bsix*(ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) & @@ -776,7 +742,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx-2,j,k)+ux(nx-5,j,k)) & + dsix*(ux(2 ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) & + bsix*(ux(1 ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-3,j,k)) & @@ -784,7 +750,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx-1,j,k)+ux(nx-4,j,k)) & + dsix*(ux(3 ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = asix*(ux(1 ,j,k)-ux(nx ,j,k) & + buffer(nx ) = asix*(ux(1 ,j,k)-ux(nx ,j,k) & -ux(nx,j,k)+ux(nx-1,j,k)) & + bsix*(ux(2 ,j,k)-ux(nx ,j,k) & -ux(nx,j,k)+ux(nx-2,j,k)) & @@ -792,10 +758,13 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx,j,k)+ux(nx-3,j,k)) & + dsix*(ux(4 ,j,k)-ux(nx ,j,k) & -ux(nx,j,k)+ux(nx-4,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo end subroutine derxx_00 @@ -821,7 +790,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - tx(1,j,k) = asix*(ux(2,j,k)-ux(1,j,k) & + buffer(1) = asix*(ux(2,j,k)-ux(1,j,k) & -ux(1,j,k)+ux(2,j,k)) & + bsix*(ux(3,j,k)-ux(1,j,k) & -ux(1,j,k)+ux(3,j,k)) & @@ -829,7 +798,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(1,j,k)+ux(4,j,k)) & + dsix*(ux(5,j,k)-ux(1,j,k) & -ux(1,j,k)+ux(5,j,k)) - tx(2,j,k) = asix*(ux(3,j,k)-ux(2,j,k) & + buffer(2) = asix*(ux(3,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(1,j,k)) & + bsix*(ux(4,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(2,j,k)) & @@ -837,7 +806,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(2,j,k)+ux(3,j,k)) & + dsix*(ux(6,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(4,j,k)) - tx(3,j,k) = asix*(ux(4,j,k)-ux(3,j,k) & + buffer(3) = asix*(ux(4,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(2,j,k)) & + bsix*(ux(5,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(1,j,k)) & @@ -845,7 +814,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(3,j,k)+ux(2,j,k)) & + dsix*(ux(7,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(3,j,k)) - tx(4,j,k) = asix*(ux(5,j,k)-ux(4,j,k) & + buffer(4) = asix*(ux(5,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(3,j,k)) & + bsix*(ux(6,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -854,8 +823,8 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) + dsix*(ux(8,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) else - tx(1,j,k) = zero - tx(2,j,k) = asix*(ux(3,j,k)-ux(2,j,k) & + buffer(1) = zero + buffer(2) = asix*(ux(3,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(1,j,k)) & + bsix*(ux(4,j,k)-ux(2,j,k) & -ux(2,j,k)-ux(2,j,k)) & @@ -863,7 +832,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(2,j,k)-ux(3,j,k)) & + dsix*(ux(6,j,k)-ux(2,j,k) & -ux(2,j,k)-ux(4,j,k)) - tx(3,j,k) = asix*(ux(4,j,k)-ux(3,j,k) & + buffer(3) = asix*(ux(4,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(2,j,k)) & + bsix*(ux(5,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(1,j,k)) & @@ -871,7 +840,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(3,j,k)-ux(2,j,k)) & + dsix*(ux(7,j,k)-ux(3,j,k) & -ux(3,j,k)-ux(3,j,k)) - tx(4,j,k) = asix*(ux(5,j,k)-ux(4,j,k) & + buffer(4) = asix*(ux(5,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(3,j,k)) & + bsix*(ux(6,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -881,15 +850,15 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(4,j,k)-ux(2,j,k)) endif else - tx(1,j,k) = as1x*ux(1,j,k) + bs1x*ux(2,j,k) & + buffer(1) = as1x*ux(1,j,k) + bs1x*ux(2,j,k) & + cs1x*ux(3,j,k) + ds1x*ux(4,j,k) - tx(2,j,k) = as2x*(ux(3,j,k)-ux(2,j,k) & + buffer(2) = as2x*(ux(3,j,k)-ux(2,j,k) & -ux(2,j,k)+ux(1,j,k)) - tx(3,j,k) = as3x*(ux(4,j,k)-ux(3,j,k) & + buffer(3) = as3x*(ux(4,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(2,j,k)) & + bs3x*(ux(5,j,k)-ux(3,j,k) & -ux(3,j,k)+ux(1,j,k)) - tx(4,j,k) = as4x*(ux(5,j,k)-ux(4,j,k) & + buffer(4) = as4x*(ux(5,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(3,j,k)) & + bs4x*(ux(6,j,k)-ux(4,j,k) & -ux(4,j,k)+ux(2,j,k)) & @@ -897,7 +866,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(4,j,k)+ux(1,j,k)) endif do concurrent (i=5:nx-4) - tx(i,j,k) = asix*(ux(i+1,j,k)-ux(i ,j,k) & + buffer(i) = asix*(ux(i+1,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-1,j,k)) & + bsix*(ux(i+2,j,k)-ux(i ,j,k) & -ux(i ,j,k)+ux(i-2,j,k)) & @@ -908,7 +877,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo if (ncln == 1) then if (npaire==1) then - tx(nx-3,j,k) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asix*(ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsix*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & @@ -916,7 +885,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-3,j,k)+ux(nx-6,j,k)) & + dsix*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = asix*(ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bsix*(ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) & @@ -924,7 +893,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-2,j,k)+ux(nx-5,j,k)) & + dsix*(ux(nx-2,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asix*(ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) & + bsix*(ux(nx-1,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-3,j,k)) & @@ -932,7 +901,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-1,j,k)+ux(nx-4,j,k)) & + dsix*(ux(nx-3,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = asix*(ux(nx-1,j,k)-ux(nx ,j,k) & + buffer(nx ) = asix*(ux(nx-1,j,k)-ux(nx ,j,k) & -ux(nx ,j,k)+ux(nx-1,j,k)) & + bsix*(ux(nx-2,j,k)-ux(nx ,j,k) & -ux(nx ,j,k)+ux(nx-2,j,k)) & @@ -941,7 +910,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) + dsix*(ux(nx-4,j,k)-ux(nx ,j,k) & -ux(nx ,j,k)+ux(nx-4,j,k)) else - tx(nx-3,j,k) = asix*( ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asix*( ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsix*( ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & @@ -949,7 +918,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-3,j,k)+ux(nx-6,j,k)) & + dsix*(-ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = asix*( ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = asix*( ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bsix*( ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) & @@ -957,7 +926,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-2,j,k)+ux(nx-5,j,k)) & + dsix*(-ux(nx-2,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = asix*( ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asix*( ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) & + bsix*(-ux(nx-1,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-3,j,k)) & @@ -965,28 +934,31 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -ux(nx-1,j,k)+ux(nx-4,j,k)) & + dsix*(-ux(nx-3,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = zero + buffer(nx ) = zero endif else - tx(nx-3,j,k) = asttx*(ux(nx-2,j,k)-ux(nx-3,j,k) & + buffer(nx-3) = asttx*(ux(nx-2,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-4,j,k)) & + bsttx*(ux(nx-1,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-5,j,k)) & + csttx*(ux(nx,j,k)-ux(nx-3,j,k) & -ux(nx-3,j,k)+ux(nx-6,j,k)) - tx(nx-2,j,k) = astx*(ux(nx-1,j,k)-ux(nx-2,j,k) & + buffer(nx-2) = astx*(ux(nx-1,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-3,j,k)) & + bstx*(ux(nx ,j,k)-ux(nx-2,j,k) & -ux(nx-2,j,k)+ux(nx-4,j,k)) - tx(nx-1,j,k) = asmx*(ux(nx ,j,k)-ux(nx-1,j,k) & + buffer(nx-1) = asmx*(ux(nx ,j,k)-ux(nx-1,j,k) & -ux(nx-1,j,k)+ux(nx-2,j,k)) - tx(nx ,j,k) = asnx*ux(nx ,j,k) + bsnx*ux(nx-1,j,k) & + buffer(nx ) = asnx*ux(nx ,j,k) + bsnx*ux(nx-1,j,k) & + csnx*ux(nx-2,j,k) + dsnx*ux(nx-3,j,k) endif - enddo - ! Solve tri-diagonal system - call xthomas(tx, sf, ss, sw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, sf, ss, sw, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo end subroutine derxx_ij @@ -1068,49 +1040,41 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) integer :: i, j, k ! Compute r.h.s. - do concurrent (k=1:nz) - do concurrent (i=1:nx) - ty(i,1,k) = asjy*(uy(i,2,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny,k)) & - + bsjy*(uy(i,3,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny-1,k)) & - + csjy*(uy(i,4,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny-2,k)) & - + dsjy*(uy(i,5,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = asjy*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) & - + bsjy*(uy(i,4,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,ny,k)) & - + csjy*(uy(i,5,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,ny-1,k)) & - + dsjy*(uy(i,6,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = asjy*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bsjy*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) & - + csjy*(uy(i,6,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,ny,k)) & - + dsjy*(uy(i,7,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = asjy*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,3,k)) & - + bsjy*(uy(i,6,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) & - + csjy*(uy(i,7,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,1,k)) & - + dsjy*(uy(i,8,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,ny,k)) - enddo - do concurrent (j=5:ny-4, i=1:nx) - ty(i,j,k) = asjy*(uy(i,j+1,k)-uy(i,j,k) & + do concurrent (k=1:nz, i=1:nx) + buffer(1) = asjy*(uy(i,2,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny,k)) & + + bsjy*(uy(i,3,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny-1,k)) & + + csjy*(uy(i,4,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny-2,k)) & + + dsjy*(uy(i,5,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,ny-3,k)) + buffer(2) = asjy*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) & + + bsjy*(uy(i,4,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,ny,k)) & + + csjy*(uy(i,5,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,ny-1,k)) & + + dsjy*(uy(i,6,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,ny-2,k)) + buffer(3) = asjy*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bsjy*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) & + + csjy*(uy(i,6,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,ny,k)) & + + dsjy*(uy(i,7,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,ny-1,k)) + buffer(4) = asjy*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,3,k)) & + + bsjy*(uy(i,6,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) & + + csjy*(uy(i,7,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,1,k)) & + + dsjy*(uy(i,8,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,ny,k)) + do concurrent (j=5:ny-4) + buffer(j) = asjy*(uy(i,j+1,k)-uy(i,j,k) & -uy(i,j,k)+uy(i,j-1,k)) & + bsjy*(uy(i,j+2,k)-uy(i,j,k) & -uy(i,j,k)+uy(i,j-2,k)) & @@ -1119,51 +1083,46 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) + dsjy*(uy(i,j+4,k)-uy(i,j,k) & -uy(i,j,k)+uy(i,j-4,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-3,k) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-4,k)) & - + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-5,k)) & - + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-6,k)) & - + dsjy*(uy(i,1 ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) & - + csjy*(uy(i,1 ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-5,k)) & - + dsjy*(uy(i,2 ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) & - + bsjy*(uy(i,1 ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-3,k)) & - + csjy*(uy(i,2 ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-4,k)) & - + dsjy*(uy(i,3 ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = asjy*(uy(i,1 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-1,k)) & - + bsjy*(uy(i,2 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-2,k)) & - + csjy*(uy(i,3 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-3,k)) & - + dsjy*(uy(i,4 ,k)-uy(i,ny ,k) & - -uy(i,ny,k)+uy(i,ny-4,k)) + buffer(ny-3) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-4,k)) & + + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-5,k)) & + + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-6,k)) & + + dsjy*(uy(i,1 ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-7,k)) + buffer(ny-2) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) & + + csjy*(uy(i,1 ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-5,k)) & + + dsjy*(uy(i,2 ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) & + + bsjy*(uy(i,1 ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-3,k)) & + + csjy*(uy(i,2 ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-4,k)) & + + dsjy*(uy(i,3 ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-5,k)) + buffer(ny ) = asjy*(uy(i,1 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-1,k)) & + + bsjy*(uy(i,2 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-2,k)) & + + csjy*(uy(i,3 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-3,k)) & + + dsjy*(uy(i,4 ,k)-uy(i,ny ,k) & + -uy(i,ny,k)+uy(i,ny-4,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - end subroutine deryy_00 !******************************************************************** @@ -1182,112 +1141,89 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer - ! Compute r.h.s. - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) + ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,1,k) = asjy*(uy(i,2,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,2,k)) & - + bsjy*(uy(i,3,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,3,k)) & - + csjy*(uy(i,4,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,4,k)) & - + dsjy*(uy(i,5,k)-uy(i,1,k) & - -uy(i,1,k)+uy(i,5,k)) - do concurrent (i=1:nx) - enddo - ty(i,2,k) = asjy*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) & - + bsjy*(uy(i,4,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,2,k)) & - + csjy*(uy(i,5,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,3,k)) & - + dsjy*(uy(i,6,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,4,k)) - do concurrent (i=1:nx) - enddo - ty(i,3,k) = asjy*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bsjy*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) & - + csjy*(uy(i,6,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + dsjy*(uy(i,7,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,3,k)) - do concurrent (i=1:nx) - enddo - ty(i,4,k) = asjy*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,3,k)) & - + bsjy*(uy(i,6,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) & - + csjy*(uy(i,7,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,1,k)) & - + dsjy*(uy(i,8,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) - enddo + buffer(1) = asjy*(uy(i,2,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,2,k)) & + + bsjy*(uy(i,3,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,3,k)) & + + csjy*(uy(i,4,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,4,k)) & + + dsjy*(uy(i,5,k)-uy(i,1,k) & + -uy(i,1,k)+uy(i,5,k)) + buffer(2) = asjy*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) & + + bsjy*(uy(i,4,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,2,k)) & + + csjy*(uy(i,5,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,3,k)) & + + dsjy*(uy(i,6,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,4,k)) + buffer(3) = asjy*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bsjy*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) & + + csjy*(uy(i,6,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + dsjy*(uy(i,7,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,3,k)) + buffer(4) = asjy*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,3,k)) & + + bsjy*(uy(i,6,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) & + + csjy*(uy(i,7,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,1,k)) & + + dsjy*(uy(i,8,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) else - do concurrent (i=1:nx) - ty(i,1,k) = zero - enddo - do concurrent (i=1:nx) - ty(i,2,k) = asjy*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) & - + bsjy*(uy(i,4,k)-uy(i,2,k) & - -uy(i,2,k)-uy(i,2,k)) & - + csjy*(uy(i,5,k)-uy(i,2,k) & - -uy(i,2,k)-uy(i,3,k)) & - + dsjy*(uy(i,6,k)-uy(i,2,k) & - -uy(i,2,k)-uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = asjy*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bsjy*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) & - + csjy*(uy(i,6,k)-uy(i,3,k) & - -uy(i,3,k)-uy(i,2,k)) & - + dsjy*(uy(i,7,k)-uy(i,3,k) & - -uy(i,3,k)-uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = asjy*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,3,k)) & - + bsjy*(uy(i,6,k)-uy(i,4,k) & - -uy(i,4,k)+uy(i,2,k)) & - + csjy*(uy(i,7,k)-uy(i,4,k) & - -uy(i,4,k)-uy(i,1,k)) & - + dsjy*(uy(i,8,k)-uy(i,4,k) & - -uy(i,4,k)-uy(i,2,k)) - enddo + buffer(1) = zero + buffer(2) = asjy*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) & + + bsjy*(uy(i,4,k)-uy(i,2,k) & + -uy(i,2,k)-uy(i,2,k)) & + + csjy*(uy(i,5,k)-uy(i,2,k) & + -uy(i,2,k)-uy(i,3,k)) & + + dsjy*(uy(i,6,k)-uy(i,2,k) & + -uy(i,2,k)-uy(i,4,k)) + buffer(3) = asjy*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bsjy*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) & + + csjy*(uy(i,6,k)-uy(i,3,k) & + -uy(i,3,k)-uy(i,2,k)) & + + dsjy*(uy(i,7,k)-uy(i,3,k) & + -uy(i,3,k)-uy(i,3,k)) + buffer(4) = asjy*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,3,k)) & + + bsjy*(uy(i,6,k)-uy(i,4,k) & + -uy(i,4,k)+uy(i,2,k)) & + + csjy*(uy(i,7,k)-uy(i,4,k) & + -uy(i,4,k)-uy(i,1,k)) & + + dsjy*(uy(i,8,k)-uy(i,4,k) & + -uy(i,4,k)-uy(i,2,k)) endif else - do concurrent (i=1:nx) - ty(i,1,k) = as1y*uy(i,1,k) + bs1y*uy(i,2,k) & - + cs1y*uy(i,3,k) + ds1y*uy(i,4,k) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = as2y*(uy(i,3,k)-uy(i,2,k) & - -uy(i,2,k)+uy(i,1,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = as3y*(uy(i,4,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,2,k)) & - + bs3y*(uy(i,5,k)-uy(i,3,k) & - -uy(i,3,k)+uy(i,1,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = as4y*(uy(i,5,k)-uy(i,4,k) & - -uy(i,4 ,k)+uy(i,3,k)) & - + bs4y*(uy(i,6,k)-uy(i,4 ,k) & - -uy(i,4 ,k)+uy(i,2,k)) & - + cs4y*(uy(i,7,k)-uy(i,4 ,k) & - -uy(i,4 ,k)+uy(i,1,k)) - enddo + buffer(1) = as1y*uy(i,1,k) + bs1y*uy(i,2,k) & + + cs1y*uy(i,3,k) + ds1y*uy(i,4,k) + buffer(2) = as2y*(uy(i,3,k)-uy(i,2,k) & + -uy(i,2,k)+uy(i,1,k)) + buffer(3) = as3y*(uy(i,4,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,2,k)) & + + bs3y*(uy(i,5,k)-uy(i,3,k) & + -uy(i,3,k)+uy(i,1,k)) + buffer(4) = as4y*(uy(i,5,k)-uy(i,4,k) & + -uy(i,4 ,k)+uy(i,3,k)) & + + bs4y*(uy(i,6,k)-uy(i,4 ,k) & + -uy(i,4 ,k)+uy(i,2,k)) & + + cs4y*(uy(i,7,k)-uy(i,4 ,k) & + -uy(i,4 ,k)+uy(i,1,k)) endif - do concurrent (j=5:ny-4, i=1:nx) - ty(i,j,k) = asjy*(uy(i,j+1,k)-uy(i,j ,k) & + do concurrent (j=5:ny-4) + buffer(j) = asjy*(uy(i,j+1,k)-uy(i,j ,k) & -uy(i,j ,k)+uy(i,j-1,k)) & + bsjy*(uy(i,j+2,k)-uy(i,j ,k) & -uy(i,j ,k)+uy(i,j-2,k)) & @@ -1298,107 +1234,88 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo if (ncln==1) then if (npaire==1) then - do concurrent (i=1:nx) - ty(i,ny-3,k) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-4,k)) & - + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-5,k)) & - + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-6,k)) & - + dsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) & - + csjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-5,k)) & - + dsjy*(uy(i,ny-2,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) & - + bsjy*(uy(i,ny-1,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-3,k)) & - + csjy*(uy(i,ny-2,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-4,k)) & - + dsjy*(uy(i,ny-3,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = asjy*(uy(i,ny-1,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-1,k)) & - + bsjy*(uy(i,ny-2,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-2,k)) & - + csjy*(uy(i,ny-3,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-3,k)) & - + dsjy*(uy(i,ny-4,k)-uy(i,ny ,k) & - -uy(i,ny ,k)+uy(i,ny-4,k)) - enddo + buffer(ny-3) = asjy*(uy(i,ny-2,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-4,k)) & + + bsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-5,k)) & + + csjy*(uy(i,ny ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-6,k)) & + + dsjy*(uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-7,k)) + buffer(ny-2) = asjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsjy*(uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) & + + csjy*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-5,k)) & + + dsjy*(uy(i,ny-2,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = asjy*(uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) & + + bsjy*(uy(i,ny-1,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-3,k)) & + + csjy*(uy(i,ny-2,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-4,k)) & + + dsjy*(uy(i,ny-3,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-5,k)) + buffer(ny ) = asjy*(uy(i,ny-1,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-1,k)) & + + bsjy*(uy(i,ny-2,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-2,k)) & + + csjy*(uy(i,ny-3,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-3,k)) & + + dsjy*(uy(i,ny-4,k)-uy(i,ny ,k) & + -uy(i,ny ,k)+uy(i,ny-4,k)) else - do concurrent (i=1:nx) - ty(i,ny-3,k) = asjy*( uy(i,ny-2,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-4,k)) & - + bsjy*( uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-5,k)) & - + csjy*(-uy(i,ny ,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-6,k)) & - + dsjy*(-uy(i,ny-1,k)-uy(i,ny-3,k) & - -uy(i,ny-3,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asjy*( uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsjy*( uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) & - + csjy*(-uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-5,k)) & - + dsjy*(-uy(i,ny-2,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asjy*( uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) & - + bsjy*(-uy(i,ny-1,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-3,k)) & - + csjy*(-uy(i,ny-2,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-4,k)) & - + dsjy*(-uy(i,ny-3,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-5,k)) - ty(i,ny ,k) = zero - enddo + buffer(ny-3) = asjy*( uy(i,ny-2,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-4,k)) & + + bsjy*( uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-5,k)) & + + csjy*(-uy(i,ny ,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-6,k)) & + + dsjy*(-uy(i,ny-1,k)-uy(i,ny-3,k) & + -uy(i,ny-3,k)+uy(i,ny-7,k)) + buffer(ny-2) = asjy*( uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsjy*( uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) & + + csjy*(-uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-5,k)) & + + dsjy*(-uy(i,ny-2,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = asjy*( uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) & + + bsjy*(-uy(i,ny-1,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-3,k)) & + + csjy*(-uy(i,ny-2,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-4,k)) & + + dsjy*(-uy(i,ny-3,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-5,k)) + buffer(ny ) = zero endif else - do concurrent (i=1:nx) - ty(i,ny-3,k) = astty*(uy(i,ny-2,k)-uy(i,ny-3 ,k) & - -uy(i,ny-3 ,k)+uy(i,ny-4,k)) & - + bstty*(uy(i,ny-1,k)-uy(i,ny-3 ,k) & - -uy(i,ny-3 ,k)+uy(i,ny-5,k)) & - + cstty*(uy(i,ny,k)-uy(i,ny-3 ,k) & - -uy(i,ny-3 ,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = asty*(uy(i,ny-1,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-3,k)) & - + bsty*(uy(i,ny ,k)-uy(i,ny-2,k) & - -uy(i,ny-2,k)+uy(i,ny-4,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = asmy*(uy(i,ny ,k)-uy(i,ny-1,k) & - -uy(i,ny-1,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = asny*uy(i,ny ,k) + bsny*uy(i,ny-1,k) & - + csny*uy(i,ny-2,k) + dsny*uy(i,ny-3,k) - enddo + buffer(ny-3) = astty*(uy(i,ny-2,k)-uy(i,ny-3 ,k) & + -uy(i,ny-3 ,k)+uy(i,ny-4,k)) & + + bstty*(uy(i,ny-1,k)-uy(i,ny-3 ,k) & + -uy(i,ny-3 ,k)+uy(i,ny-5,k)) & + + cstty*(uy(i,ny,k)-uy(i,ny-3 ,k) & + -uy(i,ny-3 ,k)+uy(i,ny-6,k)) + buffer(ny-2) = asty*(uy(i,ny-1,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-3,k)) & + + bsty*(uy(i,ny ,k)-uy(i,ny-2,k) & + -uy(i,ny-2,k)+uy(i,ny-4,k)) + buffer(ny-1) = asmy*(uy(i,ny ,k)-uy(i,ny-1,k) & + -uy(i,ny-1,k)+uy(i,ny-2,k)) + buffer(ny ) = asny*uy(i,ny ,k) + bsny*uy(i,ny-1,k) & + + csny*uy(i,ny-2,k) + dsny*uy(i,ny-3,k) endif - enddo - ! Solve tri-diagonal system - call ythomas(ty, sf, ss, sw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, sf, ss, sw, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) + enddo + enddo end subroutine deryy_ij @@ -1478,6 +1395,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1486,9 +1404,9 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) return endif - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = askz*(uz(i,j,2)-uz(i,j,1 ) & + ! Compute r.h.s. + buffer(1) = askz*(uz(i,j,2)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz )) & + bskz*(uz(i,j,3)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz-1)) & @@ -1496,9 +1414,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,1)+uz(i,j,nz-2)) & + dskz*(uz(i,j,5)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = askz*(uz(i,j,3)-uz(i,j,2 ) & + buffer(2) = askz*(uz(i,j,3)-uz(i,j,2 ) & -uz(i,j,2)+uz(i,j,1 )) & + bskz*(uz(i,j,4)-uz(i,j,2 ) & -uz(i,j,2)+uz(i,j,nz)) & @@ -1506,9 +1422,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,2)+uz(i,j,nz-1)) & + dskz*(uz(i,j,6)-uz(i,j,2 ) & -uz(i,j,2)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = askz*(uz(i,j,4)-uz(i,j,3 ) & + buffer(3) = askz*(uz(i,j,4)-uz(i,j,3 ) & -uz(i,j,3)+uz(i,j,2 )) & + bskz*(uz(i,j,5)-uz(i,j,3 ) & -uz(i,j,3)+uz(i,j,1 )) & @@ -1516,9 +1430,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,3)+uz(i,j,nz)) & + dskz*(uz(i,j,7)-uz(i,j,3 ) & -uz(i,j,3)+uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = askz*(uz(i,j,5)-uz(i,j,4 ) & + buffer(4) = askz*(uz(i,j,5)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,3 )) & + bskz*(uz(i,j,6)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,2 )) & @@ -1526,19 +1438,17 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,4)+uz(i,j,1)) & + dskz*(uz(i,j,8)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,nz)) - enddo - do concurrent (k=5:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-1)) & - + bskz*(uz(i,j,k+2)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-2)) & - + cskz*(uz(i,j,k+3)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-3)) & - + dskz*(uz(i,j,k+4)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & + do concurrent (k=5:nz-4) + tz(i,j,k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-1)) & + + bskz*(uz(i,j,k+2)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-2)) & + + cskz*(uz(i,j,k+3)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-3)) & + + dskz*(uz(i,j,k+4)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-4)) + enddo + buffer(nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-4)) & + bskz*(uz(i,j,nz-1 )-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-5)) & @@ -1546,7 +1456,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz-3)+uz(i,j,nz-6)) & + dskz*(uz(i,j,1 )-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-7)) - tz(i,j,nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bskz*(uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) & @@ -1554,7 +1464,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz-2)+uz(i,j,nz-5)) & + dskz*(uz(i,j,2 )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-6)) - tz(i,j,nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) & + bskz*(uz(i,j,1 )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-3)) & @@ -1562,7 +1472,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz-1)+uz(i,j,nz-4)) & + dskz*(uz(i,j,3 )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-5)) - tz(i,j,nz ) = askz*(uz(i,j,1 )-uz(i,j,nz ) & + buffer(nz ) = askz*(uz(i,j,1 )-uz(i,j,nz ) & -uz(i,j,nz)+uz(i,j,nz-1)) & + bskz*(uz(i,j,2 )-uz(i,j,nz ) & -uz(i,j,nz)+uz(i,j,nz-2)) & @@ -1570,10 +1480,13 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz)+uz(i,j,nz-3)) & + dskz*(uz(i,j,4 )-uz(i,j,nz ) & -uz(i,j,nz)+uz(i,j,nz-4)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derzz_00 @@ -1600,11 +1513,11 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo endif - ! Compute r.h.s. - if (ncl1==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = askz*(uz(i,j,2)-uz(i,j,1) & + do concurrent (j=1:ny, i=1:nx) + ! Compute r.h.s. + if (ncl1==1) then + if (npaire==1) then + buffer(1) = askz*(uz(i,j,2)-uz(i,j,1) & -uz(i,j,1)+uz(i,j,2)) & + bskz*(uz(i,j,3)-uz(i,j,1) & -uz(i,j,1)+uz(i,j,3)) & @@ -1612,9 +1525,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,1)+uz(i,j,4)) & + dskz*(uz(i,j,5)-uz(i,j,1) & -uz(i,j,1)+uz(i,j,5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = askz*(uz(i,j,3)-uz(i,j,2) & + buffer(2) = askz*(uz(i,j,3)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,1)) & + bskz*(uz(i,j,4)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,2)) & @@ -1622,9 +1533,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,2)+uz(i,j,3)) & + dskz*(uz(i,j,6)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = askz*(uz(i,j,4)-uz(i,j,3) & + buffer(3) = askz*(uz(i,j,4)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,2)) & + bskz*(uz(i,j,5)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,1)) & @@ -1632,9 +1541,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,3)+uz(i,j,2)) & + dskz*(uz(i,j,7)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = askz*(uz(i,j,5)-uz(i,j,4) & + buffer(4) = askz*(uz(i,j,5)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,3)) & + bskz*(uz(i,j,6)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,2)) & @@ -1642,13 +1549,9 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,4)+uz(i,j,1)) & + dskz*(uz(i,j,8)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,2)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = zero - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = askz*(uz(i,j,3)-uz(i,j,2) & + else + buffer(1) = zero + buffer(2) = askz*(uz(i,j,3)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,1)) & + bskz*(uz(i,j,4)-uz(i,j,2) & -uz(i,j,2)-uz(i,j,2)) & @@ -1656,9 +1559,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,2)-uz(i,j,3)) & + dskz*(uz(i,j,6)-uz(i,j,2) & -uz(i,j,2)-uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = askz*(uz(i,j,4)-uz(i,j,3) & + buffer(3) = askz*(uz(i,j,4)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,2)) & + bskz*(uz(i,j,5)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,1)) & @@ -1666,9 +1567,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,3)-uz(i,j,2)) & + dskz*(uz(i,j,7)-uz(i,j,3) & -uz(i,j,3)-uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = askz*(uz(i,j,5)-uz(i,j,4) & + buffer(4) = askz*(uz(i,j,5)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,3)) & + bskz*(uz(i,j,6)-uz(i,j,4) & -uz(i,j,4)+uz(i,j,2)) & @@ -1676,46 +1575,36 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,4)-uz(i,j,1)) & + dskz*(uz(i,j,8)-uz(i,j,4) & -uz(i,j,4)-uz(i,j,2)) - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = as1z*uz(i,j,1) + bs1z*uz(i,j,2) & + endif + else + buffer(1) = as1z*uz(i,j,1) + bs1z*uz(i,j,2) & + cs1z*uz(i,j,3) + ds1z*uz(i,j,4) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = as2z*(uz(i,j,3)-uz(i,j,2) & + buffer(2) = as2z*(uz(i,j,3)-uz(i,j,2) & -uz(i,j,2)+uz(i,j,1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = as3z*(uz(i,j,4)-uz(i,j,3) & + buffer(3) = as3z*(uz(i,j,4)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,2)) & + bs3z*(uz(i,j,5)-uz(i,j,3) & -uz(i,j,3)+uz(i,j,1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = as4z*(uz(i,j,5)-uz(i,j,4 ) & + buffer(4) = as4z*(uz(i,j,5)-uz(i,j,4 ) & -uz(i,j,4 )+uz(i,j,3)) & + bs4z*(uz(i,j,6)-uz(i,j,4 ) & -uz(i,j,4 )+uz(i,j,2)) & + cs4z*(uz(i,j,7)-uz(i,j,4 ) & -uz(i,j,4 )+uz(i,j,1)) + endif + do concurrent (k=5:nz-4) + buffer(k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-1)) & + + bskz*(uz(i,j,k+2)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-2)) & + + cskz*(uz(i,j,k+3)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-3)) & + + dskz*(uz(i,j,k+4)-uz(i,j,k ) & + -uz(i,j,k )+uz(i,j,k-4)) enddo - endif - do concurrent (k=5:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-1)) & - + bskz*(uz(i,j,k+2)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-2)) & - + cskz*(uz(i,j,k+3)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-3)) & - + dskz*(uz(i,j,k+4)-uz(i,j,k ) & - -uz(i,j,k )+uz(i,j,k-4)) - enddo - if (ncln==1) then - if (npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & + if (ncln==1) then + if (npaire==1) then + buffer(nz-3) = askz*(uz(i,j,nz-2)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-4)) & + bskz*(uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-5)) & @@ -1723,9 +1612,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-3)+uz(i,j,nz-6)) & + dskz*(uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-7)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = askz*(uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bskz*(uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) & @@ -1733,9 +1620,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-2)+uz(i,j,nz-5)) & + dskz*(uz(i,j,nz-2)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = askz*(uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) & + bskz*(uz(i,j,nz-1)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-3)) & @@ -1743,9 +1628,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-1)+uz(i,j,nz-4)) & + dskz*(uz(i,j,nz-3)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = askz*(uz(i,j,nz-1)-uz(i,j,nz ) & + buffer(nz ) = askz*(uz(i,j,nz-1)-uz(i,j,nz ) & -uz(i,j,nz )+uz(i,j,nz-1)) & + bskz*(uz(i,j,nz-2)-uz(i,j,nz ) & -uz(i,j,nz )+uz(i,j,nz-2)) & @@ -1753,10 +1636,8 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz )+uz(i,j,nz-3)) & + dskz*(uz(i,j,nz-4)-uz(i,j,nz ) & -uz(i,j,nz )+uz(i,j,nz-4)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = askz*( uz(i,j,nz-2)-uz(i,j,nz-3) & + else + buffer(nz-3) = askz*( uz(i,j,nz-2)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-4)) & + bskz*( uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-5)) & @@ -1764,9 +1645,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-3)+uz(i,j,nz-6)) & + dskz*(-uz(i,j,nz-1)-uz(i,j,nz-3) & -uz(i,j,nz-3)+uz(i,j,nz-7)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = askz*( uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = askz*( uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bskz*( uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) & @@ -1774,9 +1653,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-2)+uz(i,j,nz-5)) & + dskz*(-uz(i,j,nz-2)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = askz*( uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = askz*( uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) & + bskz*(-uz(i,j,nz-1)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-3)) & @@ -1784,38 +1661,31 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) -uz(i,j,nz-1)+uz(i,j,nz-4)) & + dskz*(-uz(i,j,nz-3)-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = zero - enddo - endif - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = asttz*(uz(i,j,nz-2)-uz(i,j,nz-3 ) & + buffer(nz ) = zero + endif + else + buffer(nz-3) = asttz*(uz(i,j,nz-2)-uz(i,j,nz-3 ) & -uz(i,j,nz-3 )+uz(i,j,nz-4)) & + bsttz*(uz(i,j,nz-1)-uz(i,j,nz-3 ) & -uz(i,j,nz-3 )+uz(i,j,nz-5)) & + csttz*(uz(i,j,nz)-uz(i,j,nz-3 ) & -uz(i,j,nz-3 )+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = astz*(uz(i,j,nz-1)-uz(i,j,nz-2) & + buffer(nz-2) = astz*(uz(i,j,nz-1)-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-3)) & + bstz*(uz(i,j,nz )-uz(i,j,nz-2) & -uz(i,j,nz-2)+uz(i,j,nz-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = asmz*(uz(i,j,nz )-uz(i,j,nz-1) & + buffer(nz-1) = asmz*(uz(i,j,nz )-uz(i,j,nz-1) & -uz(i,j,nz-1)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = asnz*uz(i,j,nz ) + bsnz*uz(i,j,nz-1) & + buffer(nz ) = asnz*uz(i,j,nz ) + bsnz*uz(i,j,nz-1) & + csnz*uz(i,j,nz-2) + dsnz*uz(i,j,nz-3) - enddo - endif + endif - ! Solve tri-diagonal system - call zthomas(tz, sf, ss, sw, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, sf, ss, sw, nz) + do concurrent(k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo end subroutine derzz_ij diff --git a/src/x3d_staggered.f90 b/src/x3d_staggered.f90 index 2322dda..797e7d4 100644 --- a/src/x3d_staggered.f90 +++ b/src/x3d_staggered.f90 @@ -34,28 +34,32 @@ subroutine derxvp(tx,ux,x3dop,nx,nxm,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nxm) :: buffer if (nclx) then ! nxm = nx do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + buffer(1) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + bcix6*(ux(3,j,k)-ux(nx,j,k)) - tx(2,j,k) = acix6*(ux(3,j,k)-ux(2,j,k)) & + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + bcix6*(ux(4,j,k)-ux(1,j,k)) do concurrent (i=3:nx-2) - tx(i,j,k) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) enddo - tx(nx-1,j,k) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + buffer(nx-1) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + bcix6*(ux(1 ,j,k)-ux(nx-2,j,k)) - tx(nx ,j,k) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + buffer(nx ) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 @@ -63,35 +67,38 @@ subroutine derxvp(tx,ux,x3dop,nx,nxm,ny,nz) ! Compute r.h.s. if (x3dop%npaire==1) then - tx(1,j,k) = acix6*(ux(2,j,k)-ux(1,j,k)) & + buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & + bcix6*(ux(3,j,k)-ux(2,j,k)) - tx(2,j,k) = acix6*(ux(3,j,k)-ux(2,j,k)) & + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + bcix6*(ux(4,j,k)-ux(1,j,k)) else - tx(1,j,k) = acix6*(ux(2,j,k)-ux(1,j,k)) & + buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & + bcix6*(ux(3,j,k)-two*ux(1,j,k)+ux(2,j,k)) - tx(2,j,k) = acix6*(ux(3,j,k)-ux(2,j,k)) & + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + bcix6*(ux(4,j,k)-ux(1,j,k)) endif do concurrent (i=3:nxm-2) - tx(i,j,k) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) enddo if (x3dop%npaire==1) then - tx(nxm-1,j,k) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + buffer(nxm-1) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) - tx(nxm,j,k) = acix6*(ux(nx ,j,k)-ux(nxm ,j,k)) & + buffer(nxm) = acix6*(ux(nx ,j,k)-ux(nxm ,j,k)) & + bcix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) else - tx(nxm-1,j,k) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + buffer(nxm-1) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) - tx(nxm,j,k) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & + buffer(nxm) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & + bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k)) endif - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nxm) + do concurrent (i=1:nxm) + tx(i,j,k) = buffer(i) + enddo + enddo endif @@ -113,50 +120,54 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nxm) :: buffer if (nclx) then ! nxm = nx do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + buffer(1) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + bicix6*(ux(3,j,k)+ux(nx,j,k)) & + cicix6*(ux(4,j,k)+ux(nx-1,j,k)) & + dicix6*(ux(5,j,k)+ux(nx-2,j,k)) - tx(2,j,k) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + buffer(2) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(nx,j,k)) & + dicix6*(ux(6,j,k)+ux(nx-1,j,k)) - tx(3,j,k) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + buffer(3) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(nx,j,k)) do concurrent (i=4:nx-4) - tx(i,j,k) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + buffer(i) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + bicix6*(ux(i+2,j,k)+ux(i-1,j,k)) & + cicix6*(ux(i+3,j,k)+ux(i-2,j,k)) & + dicix6*(ux(i+4,j,k)+ux(i-3,j,k)) enddo - tx(nx-3,j,k) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + buffer(nx-3) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-4,j,k)) & + cicix6*(ux(nx,j,k)+ux(nx-5,j,k)) & + dicix6*(ux(1,j,k)+ux(nx-6,j,k)) - tx(nx-2,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + buffer(nx-2) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + bicix6*(ux(nx ,j,k)+ux(nx-3,j,k)) & + cicix6*(ux(1,j,k)+ux(nx-4,j,k)) & + dicix6*(ux(2,j,k)+ux(nx-5,j,k)) - tx(nx-1,j,k) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + buffer(nx-1) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + bicix6*(ux(1 ,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(2,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(3,j,k)+ux(nx-4,j,k)) - tx(nx ,j,k) = aicix6*(ux(1,j,k)+ux(nx,j,k)) & + buffer(nx ) = aicix6*(ux(1,j,k)+ux(nx,j,k)) & + bicix6*(ux(2,j,k)+ux(nx-1,j,k)) & + cicix6*(ux(3,j,k)+ux(nx-2,j,k)) & + dicix6*(ux(4,j,k)+ux(nx-3,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 @@ -164,40 +175,43 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + buffer(1) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + bicix6*(ux(3,j,k)+ux(2,j,k)) & + cicix6*(ux(4,j,k)+ux(3,j,k)) & + dicix6*(ux(5,j,k)+ux(4,j,k)) - tx(2,j,k) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + buffer(2) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(2,j,k)) & + dicix6*(ux(6,j,k)+ux(3,j,k)) - tx(3,j,k) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + buffer(3) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(2,j,k)) do concurrent (i=4:nxm-3) - tx(i,j,k) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + buffer(i) = aicix6*(ux(i+1,j,k)+ux(i,j,k)) & + bicix6*(ux(i+2,j,k)+ux(i-1,j,k)) & + cicix6*(ux(i+3,j,k)+ux(i-2,j,k)) & + dicix6*(ux(i+4,j,k)+ux(i-3,j,k)) enddo - tx(nxm-2,j,k) = aicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + buffer(nxm-2) = aicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + bicix6*(ux(nxm,j,k)+ux(nxm-3,j,k)) & + cicix6*(ux(nx,j,k)+ux(nxm-4,j,k)) & + dicix6*(ux(nxm,j,k)+ux(nxm-5,j,k)) - tx(nxm-1,j,k) = aicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + buffer(nxm-1) = aicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + bicix6*(ux(nx,j,k)+ux(nxm-2,j,k)) & + cicix6*(ux(nxm,j,k)+ux(nxm-3,j,k)) & + dicix6*(ux(nxm-1,j,k)+ux(nxm-4,j,k)) - tx(nxm ,j,k) = aicix6*(ux(nx,j,k)+ux(nxm,j,k)) & + buffer(nxm ) = aicix6*(ux(nx,j,k)+ux(nxm,j,k)) & + bicix6*(ux(nxm,j,k)+ux(nxm-1,j,k)) & + cicix6*(ux(nxm-1,j,k)+ux(nxm-2,j,k)) & + dicix6*(ux(nxm-2,j,k)+ux(nxm-3,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nxm, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nxm) + do concurrent (i=1:nxm) + tx(i,j,k) = buffer(i) + enddo + enddo endif endif @@ -220,28 +234,32 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer if (nclx) then ! nxm = nx do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + buffer(1) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) - tx(2,j,k) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + buffer(2) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + bcix6*(ux(3,j,k)-ux(nx,j,k)) do concurrent (i=3:nx-2) - tx(i,j,k) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + buffer(i) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + bcix6*(ux(i+1,j,k)-ux(i-2,j,k)) enddo - tx(nx-1,j,k) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + bcix6*(ux(nx ,j,k)-ux(nx-3,j,k)) - tx(nx ,j,k) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + buffer(nx ) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + bcix6*(ux(1,j,k)-ux(nx-2,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 @@ -249,20 +267,23 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = zero - tx(2,j,k) = acix6*(ux(2,j,k)-ux(1,j,k)) & + buffer(1) = zero + buffer(2) = acix6*(ux(2,j,k)-ux(1,j,k)) & + bcix6*(ux(3,j,k)-ux(1,j,k)) do concurrent (i=3:nx-2) - tx(i,j,k) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + buffer(i) = acix6*(ux(i,j,k)-ux(i-1,j,k)) & + bcix6*(ux(i+1,j,k)-ux(i-2,j,k)) enddo - tx(nx-1,j,k) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + buffer(nx-1) = acix6*(ux(nx-1,j,k)-ux(nx-2,j,k)) & + bcix6*(ux(nx-1,j,k)-ux(nx-3,j,k)) - tx(nx,j,k) = zero - enddo + buffer(nx) = zero - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo endif endif @@ -285,50 +306,54 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer if (nclx) then ! nxm = nx do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(1,j,k)+ux(nx ,j,k)) & + buffer(1) = aicix6*(ux(1,j,k)+ux(nx ,j,k)) & + bicix6*(ux(2,j,k)+ux(nx-1,j,k)) & + cicix6*(ux(3,j,k)+ux(nx-2,j,k)) & + dicix6*(ux(4,j,k)+ux(nx-3,j,k)) - tx(2,j,k) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + buffer(2) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & + bicix6*(ux(3,j,k)+ux(nx,j,k)) & + cicix6*(ux(4,j,k)+ux(nx-1,j,k)) & + dicix6*(ux(5,j,k)+ux(nx-2,j,k)) - tx(3,j,k) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + buffer(3) = aicix6*(ux(3,j,k)+ux(2 ,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(nx,j,k)) & + dicix6*(ux(6,j,k)+ux(nx-1,j,k)) - tx(4,j,k) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + buffer(4) = aicix6*(ux(4,j,k)+ux(3 ,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(nx,j,k)) do concurrent (i=5:nx-3) - tx(i,j,k) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + buffer(i) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + bicix6*(ux(i+1,j,k)+ux(i-2,j,k)) & + cicix6*(ux(i+2,j,k)+ux(i-3,j,k)) & + dicix6*(ux(i+3,j,k)+ux(i-4,j,k)) enddo - tx(nx-2,j,k) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + buffer(nx-2) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-4,j,k)) & + cicix6*(ux(nx,j,k)+ux(nx-5,j,k)) & + dicix6*(ux(1,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + buffer(nx-1) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + bicix6*(ux(nx ,j,k)+ux(nx-3,j,k)) & + cicix6*(ux(1,j,k)+ux(nx-4,j,k)) & + dicix6*(ux(2,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + buffer(nx ) = aicix6*(ux(nx,j,k)+ux(nx-1,j,k)) & + bicix6*(ux(1,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(2,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(3,j,k)+ux(nx-4,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo else ! nxm = nx-1 @@ -336,48 +361,51 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) do concurrent (k=1:nz, j=1:ny) ! Compute r.h.s. - tx(1,j,k) = aicix6*(ux(1,j,k)+ux(1,j,k)) & + buffer(1) = aicix6*(ux(1,j,k)+ux(1,j,k)) & + bicix6*(ux(2,j,k)+ux(2,j,k)) & + cicix6*(ux(3,j,k)+ux(3,j,k)) & + dicix6*(ux(4,j,k)+ux(4,j,k)) - tx(2,j,k) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + buffer(2) = aicix6*(ux(2,j,k)+ux(1,j,k)) & + bicix6*(ux(3,j,k)+ux(1,j,k)) & + cicix6*(ux(4,j,k)+ux(2,j,k)) & + dicix6*(ux(5,j,k)+ux(3,j,k)) - tx(3,j,k) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + buffer(3) = aicix6*(ux(3,j,k)+ux(2,j,k)) & + bicix6*(ux(4,j,k)+ux(1,j,k)) & + cicix6*(ux(5,j,k)+ux(1,j,k)) & + dicix6*(ux(6,j,k)+ux(2,j,k)) - tx(4,j,k) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + buffer(4) = aicix6*(ux(4,j,k)+ux(3,j,k)) & + bicix6*(ux(5,j,k)+ux(2,j,k)) & + cicix6*(ux(6,j,k)+ux(1,j,k)) & + dicix6*(ux(7,j,k)+ux(1,j,k)) do concurrent (i=5:nx-4) - tx(i,j,k) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + buffer(i) = aicix6*(ux(i,j,k)+ux(i-1,j,k)) & + bicix6*(ux(i+1,j,k)+ux(i-2,j,k)) & + cicix6*(ux(i+2,j,k)+ux(i-3,j,k)) & + dicix6*(ux(i+3,j,k)+ux(i-4,j,k)) enddo - tx(nx-3,j,k) = aicix6*(ux(nx-3,j,k)+ux(nx-4,j,k)) & + buffer(nx-3) = aicix6*(ux(nx-3,j,k)+ux(nx-4,j,k)) & + bicix6*(ux(nx-2,j,k)+ux(nx-5,j,k)) & + cicix6*(ux(nx-1,j,k)+ux(nx-6,j,k)) & + dicix6*(ux(nx-1,j,k)+ux(nx-7,j,k)) - tx(nx-2,j,k) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + buffer(nx-2) = aicix6*(ux(nx-2,j,k)+ux(nx-3,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-4,j,k)) & + cicix6*(ux(nx-1,j,k)+ux(nx-5,j,k)) & + dicix6*(ux(nx-2,j,k)+ux(nx-6,j,k)) - tx(nx-1,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + buffer(nx-1) = aicix6*(ux(nx-1,j,k)+ux(nx-2,j,k)) & + bicix6*(ux(nx-1,j,k)+ux(nx-3,j,k)) & + cicix6*(ux(nx-2,j,k)+ux(nx-4,j,k)) & + dicix6*(ux(nx-3,j,k)+ux(nx-5,j,k)) - tx(nx ,j,k) = aicix6*(ux(nx-1,j,k)+ux(nx-1,j,k)) & + buffer(nx ) = aicix6*(ux(nx-1,j,k)+ux(nx-1,j,k)) & + bicix6*(ux(nx-2,j,k)+ux(nx-2,j,k)) & + cicix6*(ux(nx-3,j,k)+ux(nx-3,j,k)) & + dicix6*(ux(nx-4,j,k)+ux(nx-4,j,k)) - enddo - ! Solve tri-diagonal system - call xthomas(tx, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo endif endif @@ -400,118 +428,100 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nym) :: buffer if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,ny,k)) & - + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & - + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & - + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,ny,k)) - enddo - do concurrent (j=4:ny-4, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + buffer(1) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,ny,k)) & + + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & + + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) + buffer(2) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & + + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) + buffer(3) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,ny,k)) + enddo + do concurrent (j=4:ny-4) + buffer(j) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + biciy6*(uy(i,j+2,k)+uy(i,j-1,k)) & + ciciy6*(uy(i,j+3,k)+uy(i,j-2,k)) & + diciy6*(uy(i,j+4,k)+uy(i,j-3,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-3,k) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & - + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & - + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & - + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & - + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & - + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & - + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & - + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & - + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & - + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & - + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & - + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) + buffer(ny-3) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & + + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & + + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) + buffer(ny-2) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & + + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & + + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & + + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) + buffer(ny-1) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & + + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & + + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & + + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) + buffer(ny ) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & + + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & + + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & + + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,2,k)) & - + ciciy6*(uy(i,4,k)+uy(i,3,k)) & - + diciy6*(uy(i,5,k)+uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,2,k)) & - + diciy6*(uy(i,6,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,2,k)) - enddo - do concurrent (j=4:nym-3, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + buffer(1) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,2,k)) & + + ciciy6*(uy(i,4,k)+uy(i,3,k)) & + + diciy6*(uy(i,5,k)+uy(i,4,k)) + buffer(2) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,2,k)) & + + diciy6*(uy(i,6,k)+uy(i,3,k)) + buffer(3) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,2,k)) + do concurrent (j=4:nym-3) + buffer(j) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + biciy6*(uy(i,j+2,k)+uy(i,j-1,k)) & + ciciy6*(uy(i,j+3,k)+uy(i,j-2,k)) & + diciy6*(uy(i,j+4,k)+uy(i,j-3,k)) enddo - do concurrent (i=1:nx) - ty(i,nym-2,k) = aiciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & - + biciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & - + ciciy6*(uy(i,ny,k)+uy(i,nym-4,k)) & - + diciy6*(uy(i,nym,k)+uy(i,nym-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,nym-1,k) = aiciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & - + biciy6*(uy(i,ny,k)+uy(i,nym-2,k)) & - + ciciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & - + diciy6*(uy(i,nym-1,k)+uy(i,nym-4,k)) - enddo - do concurrent (i=1:nx) - ty(i,nym ,k) = aiciy6*(uy(i,ny,k)+uy(i,nym,k)) & - + biciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & - + ciciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & - + diciy6*(uy(i,nym-2,k)+uy(i,nym-3,k)) + buffer(nym-2) = aiciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & + + biciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & + + ciciy6*(uy(i,ny,k)+uy(i,nym-4,k)) & + + diciy6*(uy(i,nym,k)+uy(i,nym-5,k)) + buffer(nym-1) = aiciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & + + biciy6*(uy(i,ny,k)+uy(i,nym-2,k)) & + + ciciy6*(uy(i,nym,k)+uy(i,nym-3,k)) & + + diciy6*(uy(i,nym-1,k)+uy(i,nym-4,k)) + buffer(nym ) = aiciy6*(uy(i,ny,k)+uy(i,nym,k)) & + + biciy6*(uy(i,nym,k)+uy(i,nym-1,k)) & + + ciciy6*(uy(i,nym-1,k)+uy(i,nym-2,k)) & + + diciy6*(uy(i,nym-2,k)+uy(i,nym-3,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nym) + do concurrent (j=1:nym) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, nym, nz) - endif endif @@ -534,77 +544,73 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nym) :: buffer if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & - + bciy6*(uy(i,3,k)-uy(i,ny,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,3,k)-uy(i,2,k)) & - + bciy6*(uy(i,4,k)-uy(i,1,k)) + buffer(1) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + + bciy6*(uy(i,3,k)-uy(i,ny,k)) + buffer(2) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + + bciy6*(uy(i,4,k)-uy(i,1,k)) enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + do concurrent (j=3:ny-2) + buffer(j) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + bciy6*(uy(i,j+2,k)-uy(i,j-1,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & - + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & - + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) + buffer(ny-1) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & + + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) + buffer(ny ) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & + + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + if (istret /= 0) then + do concurrent (j=1:nym) + buffer(j) = buffer(j) * ppyi(j) + enddo + endif + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==0) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + buffer(1) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + bciy6*(uy(i,3,k)-two*uy(i,1,k)+uy(i,2,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + buffer(2) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + bciy6*(uy(i,4,k)-uy(i,1,k)) - enddo - do concurrent (j=3:nym-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + do concurrent (j=3:nym-2) + buffer(j) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + bciy6*(uy(i,j+2,k)-uy(i,j-1,k)) enddo - do concurrent (i=1:nx) - ty(i,nym-1,k) = aciy6*(uy(i,nym,k)-uy(i,nym-1,k)) & + buffer(nym-1) = aciy6*(uy(i,nym,k)-uy(i,nym-1,k)) & + bciy6*(uy(i,ny,k)-uy(i,nym-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,nym ,k) = aciy6*(uy(i,ny,k)-uy(i,nym,k)) & + buffer(nym ) = aciy6*(uy(i,ny,k)-uy(i,nym,k)) & + bciy6*(two*uy(i,ny,k)-uy(i,nym,k)-uy(i,nym-1,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nym) + if (istret /= 0) then + do concurrent (j=1:nym) + buffer(j) = buffer(j) * ppyi(j) + enddo + endif + do concurrent (j=1:nym) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, nym, nz) - endif endif - if (istret /= 0) then - do concurrent (k=1:nz, j=1:nym, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppyi(j) - enddo - endif - end subroutine deryvp !******************************************************************** @@ -623,130 +629,107 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & - + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & - + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & - + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,ny,k)) & - + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & - + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & - + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,ny,k)) - enddo - do concurrent (j=5:ny-3, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + buffer(1) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & + + biciy6*(uy(i,2,k)+uy(i,ny-1,k)) & + + ciciy6*(uy(i,3,k)+uy(i,ny-2,k)) & + + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) + buffer(2) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,ny,k)) & + + ciciy6*(uy(i,4,k)+uy(i,ny-1,k)) & + + diciy6*(uy(i,5,k)+uy(i,ny-2,k)) + buffer(3) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,ny,k)) & + + diciy6*(uy(i,6,k)+uy(i,ny-1,k)) + buffer(4) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,ny,k)) + do concurrent (j=5:ny-3) + buffer(j) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + biciy6*(uy(i,j+1,k)+uy(i,j-2,k)) & + ciciy6*(uy(i,j+2,k)+uy(i,j-3,k)) & + diciy6*(uy(i,j+3,k)+uy(i,j-4,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & - + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & - + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & - + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & - + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & - + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & - + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & - + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & - + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) + buffer(ny-2) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & + + ciciy6*(uy(i,ny,k)+uy(i,ny-5,k)) & + + diciy6*(uy(i,1,k)+uy(i,ny-6,k)) + buffer(ny-1) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & + + biciy6*(uy(i,ny,k)+uy(i,ny-3,k)) & + + ciciy6*(uy(i,1,k)+uy(i,ny-4,k)) & + + diciy6*(uy(i,2,k)+uy(i,ny-5,k)) + buffer(ny ) = aiciy6*(uy(i,ny,k)+uy(i,ny-1,k)) & + + biciy6*(uy(i,1,k)+uy(i,ny-2,k)) & + + ciciy6*(uy(i,2,k)+uy(i,ny-3,k)) & + + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) + + ! Solve tri-diagonal system + call thomas1d(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aiciy6*(uy(i,1,k)+uy(i,1,k)) & - + biciy6*(uy(i,2,k)+uy(i,2,k)) & - + ciciy6*(uy(i,3,k)+uy(i,3,k)) & - + diciy6*(uy(i,4,k)+uy(i,4,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & - + biciy6*(uy(i,3,k)+uy(i,1,k)) & - + ciciy6*(uy(i,4,k)+uy(i,2,k)) & - + diciy6*(uy(i,5,k)+uy(i,3,k)) - enddo - do concurrent (i=1:nx) - ty(i,3,k) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & - + biciy6*(uy(i,4,k)+uy(i,1,k)) & - + ciciy6*(uy(i,5,k)+uy(i,1,k)) & - + diciy6*(uy(i,6,k)+uy(i,2,k)) - enddo - do concurrent (i=1:nx) - ty(i,4,k) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & - + biciy6*(uy(i,5,k)+uy(i,2,k)) & - + ciciy6*(uy(i,6,k)+uy(i,1,k)) & - + diciy6*(uy(i,7,k)+uy(i,1,k)) - enddo - do concurrent (j=5:ny-4, i=1:nx) - ty(i,j,k) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + buffer(1) = aiciy6*(uy(i,1,k)+uy(i,1,k)) & + + biciy6*(uy(i,2,k)+uy(i,2,k)) & + + ciciy6*(uy(i,3,k)+uy(i,3,k)) & + + diciy6*(uy(i,4,k)+uy(i,4,k)) + buffer(2) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & + + biciy6*(uy(i,3,k)+uy(i,1,k)) & + + ciciy6*(uy(i,4,k)+uy(i,2,k)) & + + diciy6*(uy(i,5,k)+uy(i,3,k)) + buffer(3) = aiciy6*(uy(i,3,k)+uy(i,2,k)) & + + biciy6*(uy(i,4,k)+uy(i,1,k)) & + + ciciy6*(uy(i,5,k)+uy(i,1,k)) & + + diciy6*(uy(i,6,k)+uy(i,2,k)) + buffer(4) = aiciy6*(uy(i,4,k)+uy(i,3,k)) & + + biciy6*(uy(i,5,k)+uy(i,2,k)) & + + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + + diciy6*(uy(i,7,k)+uy(i,1,k)) + do concurrent (j=5:ny-4) + buffer(j) = aiciy6*(uy(i,j,k)+uy(i,j-1,k)) & + biciy6*(uy(i,j+1,k)+uy(i,j-2,k)) & + ciciy6*(uy(i,j+2,k)+uy(i,j-3,k)) & + diciy6*(uy(i,j+3,k)+uy(i,j-4,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-3,k) = aiciy6*(uy(i,ny-3,k)+uy(i,ny-4,k)) & - + biciy6*(uy(i,ny-2,k)+uy(i,ny-5,k)) & - + ciciy6*(uy(i,ny-1,k)+uy(i,ny-6,k)) & - + diciy6*(uy(i,ny-1,k)+uy(i,ny-7,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-2,k) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & - + ciciy6*(uy(i,ny-1,k)+uy(i,ny-5,k)) & - + diciy6*(uy(i,ny-2,k)+uy(i,ny-6,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & - + biciy6*(uy(i,ny-1,k)+uy(i,ny-3,k)) & - + ciciy6*(uy(i,ny-2,k)+uy(i,ny-4,k)) & - + diciy6*(uy(i,ny-3,k)+uy(i,ny-5,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny ,k) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-1,k)) & - + biciy6*(uy(i,ny-2,k)+uy(i,ny-2,k)) & - + ciciy6*(uy(i,ny-3,k)+uy(i,ny-3,k)) & - + diciy6*(uy(i,ny-4,k)+uy(i,ny-4,k)) + buffer(ny-3) = aiciy6*(uy(i,ny-3,k)+uy(i,ny-4,k)) & + + biciy6*(uy(i,ny-2,k)+uy(i,ny-5,k)) & + + ciciy6*(uy(i,ny-1,k)+uy(i,ny-6,k)) & + + diciy6*(uy(i,ny-1,k)+uy(i,ny-7,k)) + buffer(ny-2) = aiciy6*(uy(i,ny-2,k)+uy(i,ny-3,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-4,k)) & + + ciciy6*(uy(i,ny-1,k)+uy(i,ny-5,k)) & + + diciy6*(uy(i,ny-2,k)+uy(i,ny-6,k)) + buffer(ny-1) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-2,k)) & + + biciy6*(uy(i,ny-1,k)+uy(i,ny-3,k)) & + + ciciy6*(uy(i,ny-2,k)+uy(i,ny-4,k)) & + + diciy6*(uy(i,ny-3,k)+uy(i,ny-5,k)) + buffer(ny ) = aiciy6*(uy(i,ny-1,k)+uy(i,ny-1,k)) & + + biciy6*(uy(i,ny-2,k)+uy(i,ny-2,k)) & + + ciciy6*(uy(i,ny-3,k)+uy(i,ny-3,k)) & + + diciy6*(uy(i,ny-4,k)+uy(i,ny-4,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, ny) + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) - endif endif @@ -769,75 +752,70 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer if (ncly) then ! nym = ny - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & - + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & - + bciy6*(uy(i,3,k)-uy(i,ny,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + buffer(1) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & + + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) + buffer(2) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + + bciy6*(uy(i,3,k)-uy(i,ny,k)) + do concurrent (j=3:ny-2) + buffer(j) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + bciy6*(uy(i,j+1,k)-uy(i,j-2,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & - + bciy6*(uy(i,ny,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & - + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) + buffer(ny-1) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & + + bciy6*(uy(i,ny,k)-uy(i,ny-3,k)) + buffer(ny) = aciy6*(uy(i,ny,k)-uy(i,ny-1,k)) & + + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) - else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz) + do concurrent (k=1:nz, i=1:nx) ! Compute r.h.s. - do concurrent (i=1:nx) - ty(i,1,k) = zero - enddo - do concurrent (i=1:nx) - ty(i,2,k) = aciy6*(uy(i,2,k)-uy(i,1,k)) & - + bciy6*(uy(i,3,k)-uy(i,1,k)) - enddo - do concurrent (j=3:ny-2, i=1:nx) - ty(i,j,k) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + buffer(1) = zero + buffer(2) = aciy6*(uy(i,2,k)-uy(i,1,k)) & + + bciy6*(uy(i,3,k)-uy(i,1,k)) + do concurrent (j=3:ny-2) + buffer(j) = aciy6*(uy(i,j,k)-uy(i,j-1,k)) & + bciy6*(uy(i,j+1,k)-uy(i,j-2,k)) enddo - do concurrent (i=1:nx) - ty(i,ny-1,k) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & - + bciy6*(uy(i,ny-1,k)-uy(i,ny-3,k)) - enddo - do concurrent (i=1:nx) - ty(i,ny,k) = zero + buffer(ny-1) = aciy6*(uy(i,ny-1,k)-uy(i,ny-2,k)) & + + bciy6*(uy(i,ny-1,k)-uy(i,ny-3,k)) + buffer(ny) = zero + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, ny) + if (istret /= 0) then + do concurrent (j=1:ny) + buffer(j) = buffer(j) * ppy(j) + enddo + endif + do concurrent (j=1:ny) + ty(i,j,k) = buffer(j) enddo enddo - ! Solve tri-diagonal system - call ythomas(ty, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) - endif endif - if (istret /= 0) then - do concurrent (k=1:nz, j=1:ny, i=1:nx) - ty(i,j,k) = ty(i,j,k) * ppy(j) - enddo - endif - end subroutine derypv !******************************************************************** @@ -856,6 +834,7 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) ! Local variables integer :: i, j, k + real(mytype), dimension(nzm) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -866,81 +845,67 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) if (nclz) then ! nzm = nz - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + + ! Compute r.h.s. + buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,nz)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + buffer(2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + bciz6*(uz(i,j,4)-uz(i,j,1)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & - + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + do concurrent (k=3:nz-2) + buffer(k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & + + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) + enddo + buffer(nz-1) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + bciz6*(uz(i,j,1)-uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + buffer(nz ) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + bciz6*(uz(i,j,2)-uz(i,j,nz-1)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 + do concurrent (j=1:ny, i=1:nx) - ! Compute r.h.s. - if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + ! Compute r.h.s. + if (x3dop%npaire==1) then + buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,3)-uz(i,j,2))& + buffer(2) = aciz6*(uz(i,j,3)-uz(i,j,2))& + bciz6*(uz(i,j,4)-uz(i,j,1)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + else + buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-two*uz(i,j,1)+uz(i,j,2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + buffer(2) = aciz6*(uz(i,j,3)-uz(i,j,2)) & + bciz6*(uz(i,j,4)-uz(i,j,1)) + endif + do concurrent (k=3:nzm-2) + buffer(k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & + + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) enddo - endif - do concurrent (k=3:nzm-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k+1)-uz(i,j,k)) & - + bciz6*(uz(i,j,k+2)-uz(i,j,k-1)) - enddo - if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-1) = aciz6*(uz(i,j,nzm)-uz(i,j,nzm-1)) & + if (x3dop%npaire==1) then + buffer(nzm-1) = aciz6*(uz(i,j,nzm)-uz(i,j,nzm-1)) & + bciz6*(uz(i,j,nz)-uz(i,j,nzm-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nzm)) & + buffer(nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nzm)) & + bciz6*(uz(i,j,nzm)-uz(i,j,nzm-1)) - enddo - else - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + else + buffer(nzm-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + bciz6*(uz(i,j,nz)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + buffer(nzm ) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + bciz6*(two*uz(i,j,nz)-uz(i,j,nz-1)-uz(i,j,nz-2)) - enddo - endif + endif - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nzm) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nzm) + do concurrent (k=1:nzm) + tz(i,j,k) = buffer(k) + enddo + enddo endif @@ -962,6 +927,7 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) ! Local variables integer :: i, j, k + real(mytype), dimension(nzm) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -972,110 +938,94 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) if (nclz) then ! nzm = nz - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,nz)) & + ciciz6*(uz(i,j,4)+uz(i,j,nz-1)) & + diciz6*(uz(i,j,5)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,nz)) & + diciz6*(uz(i,j,6)+uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,nz)) - enddo - do concurrent (k=4:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & - + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & - + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & - + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + do concurrent (k=4:nz-4) + buffer(k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & + + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & + + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & + + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) + enddo + buffer(nz-3) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-4)) & + ciciz6*(uz(i,j,nz)+uz(i,j,nz-5)) & + diciz6*(uz(i,j,1)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + buffer(nz-2) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + biciz6*(uz(i,j,nz)+uz(i,j,nz-3)) & + ciciz6*(uz(i,j,1)+uz(i,j,nz-4)) & + diciz6*(uz(i,j,2)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + buffer(nz-1) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + biciz6*(uz(i,j,1)+uz(i,j,nz-2)) & + ciciz6*(uz(i,j,2)+uz(i,j,nz-3)) & + diciz6*(uz(i,j,3)+uz(i,j,nz-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + buffer(nz ) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + biciz6*(uz(i,j,2)+uz(i,j,nz-1)) & + ciciz6*(uz(i,j,3)+uz(i,j,nz-2)) & + diciz6*(uz(i,j,4)+uz(i,j,nz-3)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 if (x3dop%npaire==1) then - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,2)) & + ciciz6*(uz(i,j,4)+uz(i,j,3)) & + diciz6*(uz(i,j,5)+uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(2) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,2)) & + diciz6*(uz(i,j,6)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(3) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,2)) - enddo - do concurrent (k=4:nzm-3, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & - + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & - + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & - + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-2) = aiciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-2)) & + do concurrent (k=4:nzm-3) + buffer(k) = aiciz6*(uz(i,j,k+1)+uz(i,j,k)) & + + biciz6*(uz(i,j,k+2)+uz(i,j,k-1)) & + + ciciz6*(uz(i,j,k+3)+uz(i,j,k-2)) & + + diciz6*(uz(i,j,k+4)+uz(i,j,k-3)) + enddo + buffer(nzm-2) = aiciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-2)) & + biciz6*(uz(i,j,nzm)+uz(i,j,nzm-3)) & + ciciz6*(uz(i,j,nz)+uz(i,j,nzm-4)) & + diciz6*(uz(i,j,nzm)+uz(i,j,nzm-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm-1) = aiciz6*(uz(i,j,nzm)+uz(i,j,nzm-1)) & + buffer(nzm-1) = aiciz6*(uz(i,j,nzm)+uz(i,j,nzm-1)) & + biciz6*(uz(i,j,nz)+uz(i,j,nzm-2)) & + ciciz6*(uz(i,j,nzm)+uz(i,j,nzm-3)) & + diciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nzm) = aiciz6*(uz(i,j,nz)+uz(i,j,nzm)) & + buffer(nzm) = aiciz6*(uz(i,j,nz)+uz(i,j,nzm)) & + biciz6*(uz(i,j,nzm)+uz(i,j,nzm-1)) & + ciciz6*(uz(i,j,nzm-1)+uz(i,j,nzm-2)) & + diciz6*(uz(i,j,nzm-2)+uz(i,j,nzm-3)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nzm) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nzm) + do concurrent (k=1:nzm) + tz(i,j,k) = buffer(k) + enddo + enddo endif endif @@ -1098,6 +1048,7 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1108,58 +1059,52 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) if (nclz) then ! nzm = nz - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + + ! Compute r.h.s. + buffer(1) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & + bciz6*(uz(i,j,2)-uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + buffer(2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,nz)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & - + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + do concurrent (k=3:nz-2) + buffer(k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & + + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) + enddo + buffer(nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + bciz6*(uz(i,j,nz)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + buffer(nz) = aciz6*(uz(i,j,nz)-uz(i,j,nz-1)) & + bciz6*(uz(i,j,1)-uz(i,j,nz-2)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 if (x3dop%npaire==1) then - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = zero - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + + ! Compute r.h.s. + buffer(1) = zero + buffer(2) = aciz6*(uz(i,j,2)-uz(i,j,1)) & + bciz6*(uz(i,j,3)-uz(i,j,1)) - enddo - do concurrent (k=3:nz-2, j=1:ny, i=1:nx) - tz(i,j,k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & - + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + do concurrent (k=3:nz-2) + buffer(k) = aciz6*(uz(i,j,k)-uz(i,j,k-1)) & + + bciz6*(uz(i,j,k+1)-uz(i,j,k-2)) + enddo + buffer(nz-1) = aciz6*(uz(i,j,nz-1)-uz(i,j,nz-2)) & + bciz6*(uz(i,j,nz-1)-uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz) = zero - enddo + buffer(nz) = zero - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo endif endif @@ -1182,6 +1127,7 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1192,122 +1138,102 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) if (nclz) then ! nzm = nz - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & + biciz6*(uz(i,j,2)+uz(i,j,nz-1)) & + ciciz6*(uz(i,j,3)+uz(i,j,nz-2)) & + diciz6*(uz(i,j,4)+uz(i,j,nz-3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + buffer(2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,nz)) & + ciciz6*(uz(i,j,4)+uz(i,j,nz-1)) & + diciz6*(uz(i,j,5)+uz(i,j,nz-2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,nz)) & + diciz6*(uz(i,j,6)+uz(i,j,nz-1)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,nz)) - enddo - do concurrent (k=5:nz-3, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & - + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & - + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & - + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + do concurrent (k=5:nz-3) + buffer(k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & + + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & + + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & + + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) + enddo + buffer(nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-4)) & + ciciz6*(uz(i,j,nz)+uz(i,j,nz-5)) & + diciz6*(uz(i,j,1)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + buffer(nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + biciz6*(uz(i,j,nz)+uz(i,j,nz-3)) & + ciciz6*(uz(i,j,1)+uz(i,j,nz-4)) & + diciz6*(uz(i,j,2)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + buffer(nz ) = aiciz6*(uz(i,j,nz)+uz(i,j,nz-1)) & + biciz6*(uz(i,j,1)+uz(i,j,nz-2)) & + ciciz6*(uz(i,j,2)+uz(i,j,nz-3)) & + diciz6*(uz(i,j,3)+uz(i,j,nz-4)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo else ! nzm = nz-1 if (x3dop%npaire==1) then - - ! Compute r.h.s. do concurrent (j=1:ny, i=1:nx) - tz(i,j,1) = aiciz6*(uz(i,j,1)+uz(i,j,1)) & + + ! Compute r.h.s. + buffer(1) = aiciz6*(uz(i,j,1)+uz(i,j,1)) & + biciz6*(uz(i,j,2)+uz(i,j,2)) & + ciciz6*(uz(i,j,3)+uz(i,j,3)) & + diciz6*(uz(i,j,4)+uz(i,j,4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + buffer(2) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & + biciz6*(uz(i,j,3)+uz(i,j,1))& + ciciz6*(uz(i,j,4)+uz(i,j,2))& + diciz6*(uz(i,j,5)+uz(i,j,3)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + buffer(3) = aiciz6*(uz(i,j,3)+uz(i,j,2)) & + biciz6*(uz(i,j,4)+uz(i,j,1)) & + ciciz6*(uz(i,j,5)+uz(i,j,1)) & + diciz6*(uz(i,j,6)+uz(i,j,2)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + buffer(4) = aiciz6*(uz(i,j,4)+uz(i,j,3)) & + biciz6*(uz(i,j,5)+uz(i,j,2)) & + ciciz6*(uz(i,j,6)+uz(i,j,1)) & + diciz6*(uz(i,j,7)+uz(i,j,1)) - enddo - do concurrent (k=5:nz-4, j=1:ny, i=1:nx) - tz(i,j,k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & - + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & - + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & - + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-3) = aiciz6*(uz(i,j,nz-3)+uz(i,j,nz-4)) & + do concurrent (k=5:nz-4) + buffer(k) = aiciz6*(uz(i,j,k)+uz(i,j,k-1)) & + + biciz6*(uz(i,j,k+1)+uz(i,j,k-2)) & + + ciciz6*(uz(i,j,k+2)+uz(i,j,k-3)) & + + diciz6*(uz(i,j,k+3)+uz(i,j,k-4)) + enddo + buffer(nz-3) = aiciz6*(uz(i,j,nz-3)+uz(i,j,nz-4)) & + biciz6*(uz(i,j,nz-2)+uz(i,j,nz-5)) & + ciciz6*(uz(i,j,nz-1)+uz(i,j,nz-6)) & + diciz6*(uz(i,j,nz-1)+uz(i,j,nz-7)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + buffer(nz-2) = aiciz6*(uz(i,j,nz-2)+uz(i,j,nz-3)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-4)) & + ciciz6*(uz(i,j,nz-1)+uz(i,j,nz-5)) & + diciz6*(uz(i,j,nz-2)+uz(i,j,nz-6)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + buffer(nz-1) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-2)) & + biciz6*(uz(i,j,nz-1)+uz(i,j,nz-3)) & + ciciz6*(uz(i,j,nz-2)+uz(i,j,nz-4)) & + diciz6*(uz(i,j,nz-3)+uz(i,j,nz-5)) - enddo - do concurrent (j=1:ny, i=1:nx) - tz(i,j,nz ) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-1)) & + buffer(nz ) = aiciz6*(uz(i,j,nz-1)+uz(i,j,nz-1)) & + biciz6*(uz(i,j,nz-2)+uz(i,j,nz-2)) & + ciciz6*(uz(i,j,nz-3)+uz(i,j,nz-3)) & + diciz6*(uz(i,j,nz-4)+uz(i,j,nz-4)) - enddo - ! Solve tri-diagonal system - call zthomas(tz, x3dop%f, x3dop%s, x3dop%w, nx, ny, nz) + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nz) + do concurrent (k=1:nz) + tz(i,j,k) = buffer(k) + enddo + enddo endif endif From 955d3a32e7948c5cdea8fd3e848d2689bfd1831d Mon Sep 17 00:00:00 2001 From: cflag Date: Mon, 28 Feb 2022 12:27:36 +0100 Subject: [PATCH 02/10] Fix previous commit --- src/x3d_derive.f90 | 3 +++ src/x3d_staggered.f90 | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 7e131d1..dea4052 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -784,6 +784,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nx) :: buffer do concurrent (k=1:nz, j=1:ny) @@ -1038,6 +1039,7 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k + real(mytype), dimension(ny) :: buffer ! Compute r.h.s. do concurrent (k=1:nz, i=1:nx) @@ -1506,6 +1508,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) ! Local variables integer :: i, j, k + real(mytype), dimension(nz) :: buffer if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) diff --git a/src/x3d_staggered.f90 b/src/x3d_staggered.f90 index 797e7d4..e4d1c10 100644 --- a/src/x3d_staggered.f90 +++ b/src/x3d_staggered.f90 @@ -447,7 +447,6 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) + biciy6*(uy(i,5,k)+uy(i,2,k)) & + ciciy6*(uy(i,6,k)+uy(i,1,k)) & + diciy6*(uy(i,7,k)+uy(i,ny,k)) - enddo do concurrent (j=4:ny-4) buffer(j) = aiciy6*(uy(i,j+1,k)+uy(i,j,k)) & + biciy6*(uy(i,j+2,k)+uy(i,j-1,k)) & @@ -555,7 +554,6 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) + bciy6*(uy(i,3,k)-uy(i,ny,k)) buffer(2) = aciy6*(uy(i,3,k)-uy(i,2,k)) & + bciy6*(uy(i,4,k)-uy(i,1,k)) - enddo do concurrent (j=3:ny-2) buffer(j) = aciy6*(uy(i,j+1,k)-uy(i,j,k)) & + bciy6*(uy(i,j+2,k)-uy(i,j-1,k)) @@ -672,7 +670,7 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) ! Solve tri-diagonal system - call thomas1d(ty, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) do concurrent (j=1:ny) ty(i,j,k) = buffer(j) enddo From 8db39e7c50513e1f321725a49e77fbc6f8c3a7a8 Mon Sep 17 00:00:00 2001 From: cflag Date: Mon, 28 Feb 2022 13:46:23 +0100 Subject: [PATCH 03/10] Fix intent in _imp --- src/x3d_operator_1d.f90 | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/x3d_operator_1d.f90 b/src/x3d_operator_1d.f90 index f4b0719..4ce9ec9 100644 --- a/src/x3d_operator_1d.f90 +++ b/src/x3d_operator_1d.f90 @@ -463,8 +463,8 @@ subroutine first_deriv_imp_(alfa1, af1, bf1, cf1, df1, alfa2, af2, alfan, afn, b real(mytype), intent(in) :: d integer, intent(in) :: n, ncl1, ncln real(mytype), dimension(n), intent(out) :: ff, fs, fw, ffp, fsp, fwp - real(mytype), intent(out) :: alfa1, af1, bf1, cf1, df1, alfa2, af2, alfan, afn, bfn, & - cfn, dfn, alfam, afm, alfai, afi, bfi + real(mytype), intent(in) :: alfa1, af1, bf1, cf1, df1, alfa2, af2, alfan, afn, bfn, & + cfn, dfn, alfam, afm, alfai, afi, bfi integer :: i real(mytype), dimension(n) :: fb, fc @@ -696,12 +696,12 @@ subroutine second_deriv_imp_(alsa1, as1, bs1, & real(mytype), intent(in) :: d2 integer, intent(in) :: n, ncl1, ncln real(mytype), dimension(n), intent(out) :: sf, ss, sw, sfp, ssp, swp - real(mytype), intent(out) :: alsa1, as1, bs1, & - cs1, ds1, alsa2, as2, alsan, asn, bsn, csn, dsn, alsam, & - asm, alsa3, as3, bs3, alsat, ast, bst, & - alsa4, as4, bs4, cs4, & - alsatt, astt, bstt, cstt, & - alsai, asi, bsi, csi, dsi + real(mytype), intent(in) :: alsa1, as1, bs1, & + cs1, ds1, alsa2, as2, alsan, asn, bsn, csn, dsn, alsam, & + asm, alsa3, as3, bs3, alsat, ast, bst, & + alsa4, as4, bs4, cs4, & + alsatt, astt, bstt, cstt, & + alsai, asi, bsi, csi, dsi integer :: i real(mytype), dimension(n) :: sb, sc From 30b61bace77f913acbe9e8db0314fb1c278bd192 Mon Sep 17 00:00:00 2001 From: cflag Date: Mon, 28 Feb 2022 15:25:14 +0100 Subject: [PATCH 04/10] Fix thomas1d_0 --- src/thomas.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/thomas.f90 b/src/thomas.f90 index e6365ff..ac54fd3 100644 --- a/src/thomas.f90 +++ b/src/thomas.f90 @@ -221,7 +221,7 @@ pure subroutine thomas1d_0(tt, ff, fs, fw, perio, alfa, nn) call thomas1d_12(tt, ff, fs, fw, nn) ss = (tt(1)-alfa*tt(nn)) / (one + perio(1) - alfa*tt(nn)) do k = 1, nn - tt(k) = tt(k) - ss*perio(nn) + tt(k) = tt(k) - ss*perio(k) enddo end subroutine thomas1d_0 From 1c2a62bfd26abef1864f994e9c073c6a80a739d4 Mon Sep 17 00:00:00 2001 From: cflag Date: Mon, 28 Feb 2022 15:41:53 +0100 Subject: [PATCH 05/10] Add missing buffer in derzz --- src/x3d_derive.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index dea4052..82c53a5 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -1441,7 +1441,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) + dskz*(uz(i,j,8)-uz(i,j,4 ) & -uz(i,j,4)+uz(i,j,nz)) do concurrent (k=5:nz-4) - tz(i,j,k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & + buffer(k) = askz*(uz(i,j,k+1)-uz(i,j,k ) & -uz(i,j,k )+uz(i,j,k-1)) & + bskz*(uz(i,j,k+2)-uz(i,j,k ) & -uz(i,j,k )+uz(i,j,k-2)) & From 90f836ebf440c2ade301906078a736861ef77dfd Mon Sep 17 00:00:00 2001 From: cflag Date: Mon, 28 Feb 2022 15:53:55 +0100 Subject: [PATCH 06/10] Fix thomas1d_0 --- src/thomas.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/thomas.f90 b/src/thomas.f90 index ac54fd3..fd8745b 100644 --- a/src/thomas.f90 +++ b/src/thomas.f90 @@ -219,7 +219,7 @@ pure subroutine thomas1d_0(tt, ff, fs, fw, perio, alfa, nn) real(mytype) :: ss call thomas1d_12(tt, ff, fs, fw, nn) - ss = (tt(1)-alfa*tt(nn)) / (one + perio(1) - alfa*tt(nn)) + ss = (tt(1)-alfa*tt(nn)) / (one + perio(1) - alfa*perio(nn)) do k = 1, nn tt(k) = tt(k) - ss*perio(k) enddo From 66b16b9738381d67920f9aa4f73fde8b941a89f4 Mon Sep 17 00:00:00 2001 From: CFLAG Date: Fri, 4 Mar 2022 15:16:07 +0100 Subject: [PATCH 07/10] Add OpenACC instructions to help nvidia compiler parallelize loops calling the Thomas solver (#1) --- src/thomas.f90 | 4 ++++ src/x3d_derive.f90 | 24 ++++++++++++++++++++++++ src/x3d_staggered.f90 | 24 ++++++++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/src/thomas.f90 b/src/thomas.f90 index d1bd202..48ab70b 100644 --- a/src/thomas.f90 +++ b/src/thomas.f90 @@ -188,6 +188,8 @@ end subroutine zthomas_12 ! pure subroutine thomas1d_12(tt, ff, fs, fw, nn) + !$acc routine seq + implicit none integer, intent(in) :: nn @@ -208,6 +210,8 @@ end subroutine thomas1d_12 pure subroutine thomas1d_0(tt, ff, fs, fw, perio, alfa, nn) + !$acc routine seq + implicit none integer, intent(in) :: nn diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 23a8259..239993b 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -183,6 +183,8 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -225,6 +227,8 @@ subroutine derx_ij(tx,ux,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -352,6 +356,8 @@ subroutine dery_00(ty,uy,x3dop,ppy,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -403,6 +409,8 @@ subroutine dery_ij(ty,uy,ff,fs,fw,ppy,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -540,6 +548,8 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx,ny,nz real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -589,6 +599,8 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -722,6 +734,8 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -826,6 +840,8 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -1081,6 +1097,8 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -1185,6 +1203,8 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -1439,6 +1459,8 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -1550,6 +1572,8 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, npaire, ncl1, ncln real(mytype), intent(out), dimension(nx,ny,nz) :: tz diff --git a/src/x3d_staggered.f90 b/src/x3d_staggered.f90 index e4757e1..4ed6cb6 100644 --- a/src/x3d_staggered.f90 +++ b/src/x3d_staggered.f90 @@ -26,6 +26,8 @@ subroutine derxvp(tx,ux,x3dop,nx,nxm,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nxm,ny,nz) :: tx @@ -112,6 +114,8 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nxm,ny,nz) :: tx @@ -226,6 +230,8 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -298,6 +304,8 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nxm, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tx @@ -420,6 +428,8 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,nym,nz) :: ty @@ -534,6 +544,8 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,nym,nz) :: ty @@ -619,6 +631,8 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -741,6 +755,8 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nym, nz real(mytype), intent(out), dimension(nx,ny,nz) :: ty @@ -824,6 +840,8 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, nzm real(mytype), intent(out), dimension(nx,ny,nzm) :: tz @@ -917,6 +935,8 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, nzm real(mytype), intent(out), dimension(nx,ny,nzm) :: tz @@ -1038,6 +1058,8 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, nzm, ny, nz real(mytype), intent(out), dimension(nx,ny,nz) :: tz @@ -1117,6 +1139,8 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) implicit none + !$acc routine(thomas1d_0, thomas1d_12) seq + ! Arguments integer, intent(in) :: nx, ny, nz, nzm real(mytype), intent(out), dimension(nx,ny,nz) :: tz From 49383110a1b6387df350e6e3ab76ab3dd7a148ec Mon Sep 17 00:00:00 2001 From: cflag Date: Tue, 8 Mar 2022 11:23:43 +0100 Subject: [PATCH 08/10] Preliminary fix for 1D buffers --- Makefile | 8 +- src/transeq.f90 | 6 + src/x3d_derive.f90 | 96 ++++++++--- src/x3d_operator_1d.f90 | 16 +- src/x3d_staggered.f90 | 349 ++++++++++++++++++++++++---------------- src/xcompact3d.f90 | 12 +- 6 files changed, 308 insertions(+), 179 deletions(-) diff --git a/Makefile b/Makefile index 82e4dca..3ae4dc2 100644 --- a/Makefile +++ b/Makefile @@ -47,11 +47,13 @@ else ifeq ($(CMP),nagfor) else ifeq ($(CMP),cray) FC = ftn FFLAGS += -eF -g -O3 -N 1023 + FFLAGS += -h omp -h thread_do_concurrent + LFLAGS += -h omp -h thread_do_concurrent else ifeq ($(CMP),nvhpc) FC = mpif90 FFLAGS += -cpp -O3 -march=native FFLAGS += -Minfo=accel -stdpar -acc -target=multicore -# FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu -fast -O3 -Minstrument + FFLAGS = -cpp -Mfree -Kieee -Minfo=accel,ftn,inline,loop,vect,opt,stdpar -stdpar=gpu -gpu=cc70,managed,lineinfo,deepcopy -acc -target=gpu -traceback -O3 -Minstrument -g LFLAGS += -acc -lnvhpcwrapnvtx endif @@ -68,9 +70,9 @@ SRC = $(SRCDIR)/x3d_precision.f90 $(SRCDIR)/module_param.f90 $(SRCDIR)/time_inte ifeq ($(FFT),fftw3) #FFTW3_PATH=/usr #FFTW3_PATH=/usr/lib64 - FFTW3_PATH=/usr/local/Cellar/fftw/3.3.7_1 + FFTW3_PATH=/opt/software/builder/developers/libraries/fftw/3.3.8/1/gcc-10.2.0-openmpi-4.0.5 INC=-I$(FFTW3_PATH)/include - LIBFFT=-L$(FFTW3_PATH) -lfftw3 -lfftw3f + LIBFFT=-L$(FFTW3_PATH)/lib -lfftw3 -lfftw3f else ifeq ($(FFT),fftw3_f03) FFTW3_PATH=/usr #ubuntu # apt install libfftw3-dev #FFTW3_PATH=/usr/lib64 #fedora # dnf install fftw fftw-devel diff --git a/src/transeq.f90 b/src/transeq.f90 index b1078c4..215833f 100644 --- a/src/transeq.f90 +++ b/src/transeq.f90 @@ -75,11 +75,17 @@ subroutine momentum_rhs_eq(dux1,duy1,duz1,ux1,uy1,uz1) enddo call derx (td1,ta1,x3d_op_derxp,xsize(1),xsize(2),xsize(3)) + print *, "td1", minval(td1), maxval(td1) call derx (te1,tb1,x3d_op_derx, xsize(1),xsize(2),xsize(3)) + print *, "te1", minval(te1), maxval(te1) call derx (tf1,tc1,x3d_op_derx, xsize(1),xsize(2),xsize(3)) + print *, "tf1", minval(tf1), maxval(tf1) call derx (ta1,ux1,x3d_op_derx, xsize(1),xsize(2),xsize(3)) + print *, "ta1", minval(ta1), maxval(ta1) call derx (tb1,uy1,x3d_op_derxp,xsize(1),xsize(2),xsize(3)) + print *, "tb1", minval(tb1), maxval(tb1) call derx (tc1,uz1,x3d_op_derxp,xsize(1),xsize(2),xsize(3)) + print *, "tc1", minval(tc1), maxval(tc1) ! Convective terms of x-pencil are stored in tg1,th1,ti1 do concurrent (k=1:xsize(3), j=1:xsize(2), i=1:xsize(1)) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 239993b..78d9d2d 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -193,9 +193,18 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nx) :: buffer - - do concurrent (k=1:nz, j=1:ny) + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + real(mytype) :: alfa + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + pp(i) = x3dop%periodic(i) + end do + alfa = x3dop%alfa + + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & + bfix*(ux(3,j,k)-ux(nx-1,j,k)) @@ -211,8 +220,8 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) + bfix*(ux(2,j,k)-ux(nx-2,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) - do concurrent(i=1:nx) + call thomas1d(buffer, ff, ss, ww, pp, alfa, nx) + do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo enddo @@ -239,7 +248,7 @@ subroutine derx_ij(tx,ux,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) integer :: i, j, k real(mytype), dimension(nx) :: buffer - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then @@ -367,9 +376,16 @@ subroutine dery_00(ty,uy,x3dop,ppy,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(ny) :: buffer + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp - do concurrent (k=1:nz, i=1:nx) + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + pp(j) = x3dop%periodic(j) + end do + + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = afjy*(uy(i,2,k)-uy(i,ny,k)) & + bfjy*(uy(i,3,k)-uy(i,ny-1,k)) @@ -385,7 +401,7 @@ subroutine dery_00(ty,uy,x3dop,ppy,nx,ny,nz) + bfjy*(uy(i,2,k)-uy(i,ny-2,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) ! Apply stretching if needed if (istret /= 0) then @@ -421,7 +437,7 @@ subroutine dery_ij(ty,uy,ff,fs,fw,ppy,nx,ny,nz,npaire,ncl1,ncln) integer :: i, j, k real(mytype), dimension(ny) :: buffer - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. if (ncl1==1) then @@ -558,7 +574,7 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nz) :: buffer + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -567,7 +583,14 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) return endif - do concurrent (j=1:ny, i=1:nx) + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = afkz*(uz(i,j,2)-uz(i,j,nz )) & + bfkz*(uz(i,j,3)-uz(i,j,nz-1)) @@ -583,7 +606,7 @@ subroutine derz_00(tz,uz,x3dop,nx,ny,nz) + bfkz*(uz(i,j,2)-uz(i,j,nz-2)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -618,7 +641,7 @@ subroutine derz_ij(tz,uz,ff,fs,fw,nx,ny,nz,npaire,ncl1,ncln) return endif - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then @@ -744,10 +767,17 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nx) :: buffer + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do ! Compute r.h.s. - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) buffer(1) = asix*(ux(2,j,k)-ux(1 ,j,k) & -ux(1,j,k)+ux(nx ,j,k)) & + bsix*(ux(3,j,k)-ux(1 ,j,k) & @@ -824,7 +854,7 @@ subroutine derxx_00(tx,ux,x3dop,nx,ny,nz) -ux(nx,j,k)+ux(nx-4,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo @@ -852,7 +882,7 @@ subroutine derxx_ij(tx,ux,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) integer :: i, j, k real(mytype), dimension(nx) :: buffer - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. if (ncl1==1) then @@ -1107,10 +1137,17 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(ny) :: buffer + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do ! Compute r.h.s. - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) buffer(1) = asjy*(uy(i,2,k)-uy(i,1,k) & -uy(i,1,k)+uy(i,ny,k)) & + bsjy*(uy(i,3,k)-uy(i,1,k) & @@ -1187,7 +1224,7 @@ subroutine deryy_00(ty,uy,x3dop,nx,ny,nz) -uy(i,ny,k)+uy(i,ny-4,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) do concurrent (j=1:ny) ty(i,j,k) = buffer(j) enddo @@ -1215,7 +1252,7 @@ subroutine deryy_ij(ty,uy,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) integer :: i, j, k real(mytype), dimension(ny) :: buffer - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then @@ -1469,7 +1506,14 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nz) :: buffer + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp + + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1478,7 +1522,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) return endif - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = askz*(uz(i,j,2)-uz(i,j,1 ) & -uz(i,j,1)+uz(i,j,nz )) & @@ -1556,7 +1600,7 @@ subroutine derzz_00(tz,uz,x3dop,nx,ny,nz) -uz(i,j,nz)+uz(i,j,nz-4)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -1590,7 +1634,7 @@ subroutine derzz_ij(tz,uz,sf,ss,sw,nx,ny,nz,npaire,ncl1,ncln) enddo endif - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. if (ncl1==1) then if (npaire==1) then diff --git a/src/x3d_operator_1d.f90 b/src/x3d_operator_1d.f90 index ee5dda0..fd0bdea 100644 --- a/src/x3d_operator_1d.f90 +++ b/src/x3d_operator_1d.f90 @@ -993,14 +993,14 @@ subroutine interpol_imp_(dx, nxm, nx, nclx1, nclxn, & real(mytype), intent(in) :: dx integer, intent(in) :: nxm, nx, nclx1, nclxn - real(mytype) :: alcaix6, acix6, bcix6 - real(mytype) :: ailcaix6, aicix6, bicix6, cicix6, dicix6 - real(mytype), dimension(nxm) :: cfx6, ccx6, cbx6, cfxp6, ciwxp6, csxp6, & - cwxp6, csx6, cwx6, cifx6, cicx6, cisx6 - real(mytype), dimension(nxm) :: cibx6, cifxp6, cisxp6, ciwx6 - real(mytype), dimension(nx) :: cfi6, cci6, cbi6, cfip6, csip6, cwip6, csi6, & - cwi6, cifi6, cici6, cibi6, cifip6 - real(mytype), dimension(nx) :: cisip6, ciwip6, cisi6, ciwi6 + real(mytype), intent(in) :: alcaix6, acix6, bcix6 + real(mytype), intent(in) :: ailcaix6, aicix6, bicix6, cicix6, dicix6 + real(mytype), dimension(nxm), intent(out) :: cfx6, ccx6, cbx6, cfxp6, ciwxp6, csxp6, & + cwxp6, csx6, cwx6, cifx6, cicx6, cisx6 + real(mytype), dimension(nxm), intent(out) :: cibx6, cifxp6, cisxp6, ciwx6 + real(mytype), dimension(nx), intent(out) :: cfi6, cci6, cbi6, cfip6, csip6, cwip6, csi6, & + cwi6, cifi6, cici6, cibi6, cifip6 + real(mytype), dimension(nx), intent(out) :: cisip6, ciwip6, cisi6, ciwi6 integer :: i diff --git a/src/x3d_staggered.f90 b/src/x3d_staggered.f90 index 4ed6cb6..ecbdc53 100644 --- a/src/x3d_staggered.f90 +++ b/src/x3d_staggered.f90 @@ -22,87 +22,89 @@ module x3d_staggered ! subroutine derxvp(tx,ux,x3dop,nx,nxm,ny,nz) - use x3d_operator_x_data - - implicit none - - !$acc routine(thomas1d_0, thomas1d_12) seq - - ! Arguments - integer, intent(in) :: nx, nxm, ny, nz - real(mytype), intent(out), dimension(nxm,ny,nz) :: tx - real(mytype), intent(in), dimension(nx,ny,nz) :: ux - type(x3doperator1d), intent(in) :: x3dop - - ! Local variables - integer :: i, j, k - real(mytype), dimension(nxm) :: buffer - - if (nclx) then - ! nxm = nx - do concurrent (k=1:nz, j=1:ny) - - ! Compute r.h.s. - buffer(1) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & - + bcix6*(ux(3,j,k)-ux(nx,j,k)) - buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & - + bcix6*(ux(4,j,k)-ux(1,j,k)) - do concurrent (i=3:nx-2) - buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & - + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) - enddo - buffer(nx-1) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & - + bcix6*(ux(1 ,j,k)-ux(nx-2,j,k)) - buffer(nx ) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & - + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) - - ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) - do concurrent (i=1:nx) - tx(i,j,k) = buffer(i) - enddo - enddo - - else - ! nxm = nx-1 - do concurrent (k=1:nz, j=1:ny) - - ! Compute r.h.s. - if (x3dop%npaire==1) then - buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & - + bcix6*(ux(3,j,k)-ux(2,j,k)) - buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & - + bcix6*(ux(4,j,k)-ux(1,j,k)) - else - buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & - + bcix6*(ux(3,j,k)-two*ux(1,j,k)+ux(2,j,k)) - buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & - + bcix6*(ux(4,j,k)-ux(1,j,k)) - endif - do concurrent (i=3:nxm-2) - buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & - + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) - enddo - if (x3dop%npaire==1) then - buffer(nxm-1) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + use x3d_operator_x_data + + implicit none + + !$acc routine(thomas1d_0, thomas1d_12) seq + + ! Arguments + integer, intent(in) :: nx, nxm, ny, nz + real(mytype), intent(out), dimension(nxm,ny,nz) :: tx + real(mytype), intent(in), dimension(nx,ny,nz) :: ux + type(x3doperator1d), intent(in) :: x3dop + + ! Local variables + integer :: i, j, k + real(mytype), dimension(nxm) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nxm) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do + + if (nclx) then + ! nxm = nx + do concurrent (k=1:nz, j=1:ny) local(buffer) + ! Compute r.h.s. + buffer(1) = acix6*(ux(2,j,k)-ux(1 ,j,k)) & + + bcix6*(ux(3,j,k)-ux(nx,j,k)) + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + + bcix6*(ux(4,j,k)-ux(1,j,k)) + do concurrent (i=3:nx-2) + buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) + enddo + buffer(nx-1) = acix6*(ux(nx,j,k)-ux(nx-1,j,k)) & + + bcix6*(ux(1 ,j,k)-ux(nx-2,j,k)) + buffer(nx ) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & + + bcix6*(ux(2,j,k)-ux(nx-1,j,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo + + else + ! nxm = nx-1 + do concurrent (k=1:nz, j=1:ny) + + ! Compute r.h.s. + if (x3dop%npaire==1) then + buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & + + bcix6*(ux(3,j,k)-ux(2,j,k)) + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + + bcix6*(ux(4,j,k)-ux(1,j,k)) + else + buffer(1) = acix6*(ux(2,j,k)-ux(1,j,k)) & + + bcix6*(ux(3,j,k)-two*ux(1,j,k)+ux(2,j,k)) + buffer(2) = acix6*(ux(3,j,k)-ux(2,j,k)) & + + bcix6*(ux(4,j,k)-ux(1,j,k)) + endif + do concurrent (i=3:nxm-2) + buffer(i) = acix6*(ux(i+1,j,k)-ux(i ,j,k)) & + + bcix6*(ux(i+2,j,k)-ux(i-1,j,k)) + enddo + if (x3dop%npaire==1) then + buffer(nxm-1) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & + + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) + buffer(nxm) = acix6*(ux(nx ,j,k)-ux(nxm ,j,k)) & + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) - buffer(nxm) = acix6*(ux(nx ,j,k)-ux(nxm ,j,k)) & - + bcix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) - else - buffer(nxm-1) = acix6*(ux(nxm,j,k)-ux(nxm-1,j,k)) & - + bcix6*(ux(nx ,j,k)-ux(nxm-2,j,k)) - buffer(nxm) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & - + bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k)) - endif - - ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nxm) - do concurrent (i=1:nxm) - tx(i,j,k) = buffer(i) - enddo - enddo - - endif + buffer(nxm) = acix6*(ux(nx,j,k)-ux(nxm,j,k)) & + + bcix6*(two*ux(nx,j,k)-ux(nxm,j,k)-ux(nxm-1,j,k)) + endif + + ! Solve tri-diagonal system + call thomas1d(buffer, ff, ss, ww, nxm) + do concurrent (i=1:nxm) + tx(i,j,k) = buffer(i) + enddo + enddo + endif end subroutine derxvp @@ -124,11 +126,18 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nxm) :: buffer + real(mytype), dimension(nxm) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nxm) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do if (nclx) then ! nxm = nx - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = aicix6*(ux(2,j,k)+ux(1 ,j,k)) & @@ -167,7 +176,7 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) + dicix6*(ux(4,j,k)+ux(nx-3,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo @@ -176,7 +185,7 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) else ! nxm = nx-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = aicix6*(ux(2,j,k)+ux(1,j,k)) & @@ -211,7 +220,7 @@ subroutine interxvp(tx,ux,x3dop,nx,nxm,ny,nz) + dicix6*(ux(nxm-2,j,k)+ux(nxm-3,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nxm) + call thomas1d(buffer, ff, ss, ww, nxm) do concurrent (i=1:nxm) tx(i,j,k) = buffer(i) enddo @@ -240,11 +249,18 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nx) :: buffer + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do if (nclx) then ! nxm = nx - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = acix6*(ux(1,j,k)-ux(nx ,j,k)) & @@ -261,7 +277,7 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) + bcix6*(ux(1,j,k)-ux(nx-2,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo @@ -270,7 +286,7 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) else ! nxm = nx-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = zero @@ -285,7 +301,7 @@ subroutine derxpv(tx,ux,x3dop,nxm,nx,ny,nz) buffer(nx) = zero ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nx) + call thomas1d(buffer, ff, ss, ww, nx) do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo @@ -314,11 +330,18 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nx) :: buffer + real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp + + do concurrent (i=1:nx) + ff(i) = x3dop%f(i) + ss(i) = x3dop%s(i) + ww(i) = x3dop%w(i) + if (allocated(x3dop%periodic)) pp(i) = x3dop%periodic(i) + end do if (nclx) then ! nxm = nx - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = aicix6*(ux(1,j,k)+ux(nx ,j,k)) & @@ -357,7 +380,7 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) + dicix6*(ux(3,j,k)+ux(nx-4,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nx) do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo @@ -366,7 +389,7 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) else ! nxm = nx-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, j=1:ny) + do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s. buffer(1) = aicix6*(ux(1,j,k)+ux(1,j,k)) & @@ -409,7 +432,7 @@ subroutine interxpv(tx,ux,x3dop,nxm,nx,ny,nz) + dicix6*(ux(nx-4,j,k)+ux(nx-4,j,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nx) + call thomas1d(buffer, ff, ss, ww, nx) do concurrent (i=1:nx) tx(i,j,k) = buffer(i) enddo @@ -438,11 +461,18 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nym) :: buffer + real(mytype), dimension(nym) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:nym) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & @@ -481,7 +511,7 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) + diciy6*(uy(i,4,k)+uy(i,ny-3,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) do concurrent (j=1:ny) ty(i,j,k) = buffer(j) enddo @@ -490,7 +520,7 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciy6*(uy(i,2,k)+uy(i,1,k)) & @@ -525,7 +555,7 @@ subroutine interyvp(ty,uy,x3dop,nx,ny,nym,nz) + diciy6*(uy(i,nym-2,k)+uy(i,nym-3,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nym) + call thomas1d(buffer, ff, ss, ww, nym) do concurrent (j=1:nym) ty(i,j,k) = buffer(j) enddo @@ -555,11 +585,18 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nym) :: buffer + real(mytype), dimension(nym) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:nym) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aciy6*(uy(i,2,k)-uy(i,1,k)) & @@ -576,7 +613,7 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) + bciy6*(uy(i,2,k)-uy(i,ny-1,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) if (istret /= 0) then do concurrent (j=1:nym) buffer(j) = buffer(j) * ppyi(j) @@ -590,7 +627,7 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) else ! nym = ny-1 if (x3dop%npaire==0) then - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aciy6*(uy(i,2,k)-uy(i,1,k)) & @@ -607,7 +644,7 @@ subroutine deryvp(ty,uy,x3dop,ppyi,nx,ny,nym,nz) + bciy6*(two*uy(i,ny,k)-uy(i,nym,k)-uy(i,nym-1,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nym) + call thomas1d(buffer, ff, ss, ww, nym) if (istret /= 0) then do concurrent (j=1:nym) buffer(j) = buffer(j) * ppyi(j) @@ -641,11 +678,18 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(ny) :: buffer + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciy6*(uy(i,1,k)+uy(i,ny,k)) & @@ -684,7 +728,7 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) + diciy6*(uy(i,3,k)+uy(i,ny-4,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) do concurrent (j=1:ny) ty(i,j,k) = buffer(j) enddo @@ -693,7 +737,7 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciy6*(uy(i,1,k)+uy(i,1,k)) & @@ -736,7 +780,7 @@ subroutine interypv(ty,uy,x3dop,nx,nym,ny,nz) + diciy6*(uy(i,ny-4,k)+uy(i,ny-4,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, ny) + call thomas1d(buffer, ff, ss, ww, ny) do concurrent (j=1:ny) ty(i,j,k) = buffer(j) enddo @@ -766,11 +810,18 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(ny) :: buffer + real(mytype), dimension(ny) :: buffer, ff, ss, ww, pp + + do concurrent (j=1:ny) + ff(j) = x3dop%f(j) + ss(j) = x3dop%s(j) + ww(j) = x3dop%w(j) + if (allocated(x3dop%periodic)) pp(j) = x3dop%periodic(j) + end do if (ncly) then ! nym = ny - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aciy6*(uy(i,1,k)-uy(i,ny,k)) & @@ -787,7 +838,7 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) + bciy6*(uy(i,1,k)-uy(i,ny-2,k)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, ny) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, ny) if (istret /= 0) then do concurrent (j=1:ny) buffer(j) = buffer(j) * ppy(j) @@ -801,7 +852,7 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) else ! nym = ny-1 if (x3dop%npaire==1) then - do concurrent (k=1:nz, i=1:nx) + do concurrent (k=1:nz, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = zero @@ -816,7 +867,7 @@ subroutine derypv(ty,uy,x3dop,ppy,nx,nym,ny,nz) buffer(ny) = zero ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, ny) + call thomas1d(buffer, ff, ss, ww, ny) if (istret /= 0) then do concurrent (j=1:ny) buffer(j) = buffer(j) * ppy(j) @@ -850,7 +901,7 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) ! Local variables integer :: i, j, k - real(mytype), dimension(nzm) :: buffer + real(mytype), dimension(nzm) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -859,9 +910,16 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) return endif + do concurrent (k=1:nzm) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aciz6*(uz(i,j,2)-uz(i,j,1)) & @@ -878,7 +936,7 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) + bciz6*(uz(i,j,2)-uz(i,j,nz-1)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -886,7 +944,7 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) else ! nzm = nz-1 - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. if (x3dop%npaire==1) then @@ -917,7 +975,7 @@ subroutine derzvp(tz,uz,x3dop,nx,ny,nz,nzm) endif ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nzm) + call thomas1d(buffer, ff, ss, ww, nzm) do concurrent (k=1:nzm) tz(i,j,k) = buffer(k) enddo @@ -945,7 +1003,7 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) ! Local variables integer :: i, j, k - real(mytype), dimension(nzm) :: buffer + real(mytype), dimension(nzm) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -954,9 +1012,16 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) return endif + do concurrent (k=1:nzm) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & @@ -995,7 +1060,7 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) + diciz6*(uz(i,j,4)+uz(i,j,nz-3)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -1004,7 +1069,7 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) else ! nzm = nz-1 if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciz6*(uz(i,j,2)+uz(i,j,1)) & @@ -1039,7 +1104,7 @@ subroutine interzvp(tz,uz,x3dop,nx,ny,nz,nzm) + diciz6*(uz(i,j,nzm-2)+uz(i,j,nzm-3)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nzm) + call thomas1d(buffer, ff, ss, ww, nzm) do concurrent (k=1:nzm) tz(i,j,k) = buffer(k) enddo @@ -1068,7 +1133,7 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nz) :: buffer + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1077,9 +1142,16 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) return endif + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aciz6*(uz(i,j,1)-uz(i,j,nz)) & @@ -1096,7 +1168,7 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) + bciz6*(uz(i,j,1)-uz(i,j,nz-2)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -1105,7 +1177,7 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) else ! nzm = nz-1 if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = zero @@ -1120,7 +1192,7 @@ subroutine derzpv(tz,uz,x3dop,nx,ny,nzm,nz) buffer(nz) = zero ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nz) + call thomas1d(buffer, ff, ss, ww, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -1149,7 +1221,7 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nz) :: buffer + real(mytype), dimension(nz) :: buffer, ff, ss, ww, pp if (nz==1) then do concurrent(k=1:nz, j=1:ny, i=1:nx) @@ -1158,9 +1230,16 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) return endif + do concurrent (k=1:nz) + ff(k) = x3dop%f(k) + ss(k) = x3dop%s(k) + ww(k) = x3dop%w(k) + if (allocated(x3dop%periodic)) pp(k) = x3dop%periodic(k) + end do + if (nclz) then ! nzm = nz - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciz6*(uz(i,j,1)+uz(i,j,nz)) & @@ -1199,7 +1278,7 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) + diciz6*(uz(i,j,3)+uz(i,j,nz-4)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nz) + call thomas1d(buffer, ff, ss, ww, pp, x3dop%alfa, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo @@ -1208,7 +1287,7 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) else ! nzm = nz-1 if (x3dop%npaire==1) then - do concurrent (j=1:ny, i=1:nx) + do concurrent (j=1:ny, i=1:nx) local(buffer) ! Compute r.h.s. buffer(1) = aiciz6*(uz(i,j,1)+uz(i,j,1)) & @@ -1251,7 +1330,7 @@ subroutine interzpv(tz,uz,x3dop,nx,ny,nzm,nz) + diciz6*(uz(i,j,nz-4)+uz(i,j,nz-4)) ! Solve tri-diagonal system - call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, nz) + call thomas1d(buffer, ff, ss, ww, nz) do concurrent (k=1:nz) tz(i,j,k) = buffer(k) enddo diff --git a/src/xcompact3d.f90 b/src/xcompact3d.f90 index 031fa64..0ecc95b 100644 --- a/src/xcompact3d.f90 +++ b/src/xcompact3d.f90 @@ -13,6 +13,7 @@ program xcompact3d use navier, only : solve_poisson, cor_vel use case use time_integrators, only : int_time + use nvtx implicit none @@ -23,9 +24,7 @@ program xcompact3d integer :: code call boot_xcompact3d() - call init_xcompact3d(ndt_max) - call case_init(ux1, uy1, uz1) telapsed = 0 @@ -36,12 +35,12 @@ program xcompact3d itr = 1 ! no inner iterations !call init_flowfield() - - tstart = MPI_Wtime() - call case_bc(ux1, uy1, uz1) - + call nvtxStartRange("transeq") + tstart = MPI_Wtime() call calculate_transeq_rhs(dux1,duy1,duz1,ux1,uy1,uz1) + tend = MPI_Wtime() + call nvtxEndRange call int_time(ux1,uy1,uz1,dux1,duy1,duz1) !do concurrent (k=1:zsize(3), j=1:zsize(2), i=1:zsize(1)) @@ -50,7 +49,6 @@ program xcompact3d call solve_poisson(pp3,px1,py1,pz1,ux1,uy1,uz1) call cor_vel(ux1,uy1,uz1,px1,py1,pz1) - tend = MPI_Wtime() telapsed = telapsed + (tend - tstart) tmin = telapsed tmax = telapsed From 3f5ba5b3e54073b4552439dc5f9653792e4edc24 Mon Sep 17 00:00:00 2001 From: cflag Date: Wed, 9 Mar 2022 11:36:41 +0100 Subject: [PATCH 09/10] Fix 1D buffers in derx_00 --- src/x3d_derive.f90 | 77 +++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 78d9d2d..99773ff 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -193,38 +193,59 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k - real(mytype), dimension(nx) :: buffer, ff, ss, ww, pp - real(mytype) :: alfa + real(mytype), dimension(nx) :: buffer - do concurrent (i=1:nx) - ff(i) = x3dop%f(i) - ss(i) = x3dop%s(i) - ww(i) = x3dop%w(i) - pp(i) = x3dop%periodic(i) - end do - alfa = x3dop%alfa + ! This is working (with deepcopy) + !$acc parallel loop gang vector collapse(2) private(buffer) + do k = 1, nz + do j = 1, ny + ! Compute r.h.s. + buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & + + bfix*(ux(3,j,k)-ux(nx-1,j,k)) + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + + bfix*(ux(4,j,k)-ux(nx,j,k)) + do concurrent (i=3:nx-2) + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) + enddo + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + + bfix*(ux(1,j,k)-ux(nx-3,j,k)) + buffer(nx) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + + bfix*(ux(2,j,k)-ux(nx-2,j,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo + enddo + enddo + !$acc end parallel loop +#ifdef 0 + ! This is broken (with deepcopy) do concurrent (k=1:nz, j=1:ny) local(buffer) - ! Compute r.h.s. - buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & - + bfix*(ux(3,j,k)-ux(nx-1,j,k)) - buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & - + bfix*(ux(4,j,k)-ux(nx,j,k)) - do concurrent (i=3:nx-2) - buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & - + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) - enddo - buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & - + bfix*(ux(1,j,k)-ux(nx-3,j,k)) - buffer(nx) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & - + bfix*(ux(2,j,k)-ux(nx-2,j,k)) - - ! Solve tri-diagonal system - call thomas1d(buffer, ff, ss, ww, pp, alfa, nx) - do concurrent (i=1:nx) - tx(i,j,k) = buffer(i) - enddo + ! Compute r.h.s. + buffer(1) = afix*(ux(2,j,k)-ux(nx,j,k)) & + + bfix*(ux(3,j,k)-ux(nx-1,j,k)) + buffer(2) = afix*(ux(3,j,k)-ux(1,j,k)) & + + bfix*(ux(4,j,k)-ux(nx,j,k)) + do concurrent (i=3:nx-2) + buffer(i) = afix*(ux(i+1,j,k)-ux(i-1,j,k)) & + + bfix*(ux(i+2,j,k)-ux(i-2,j,k)) + enddo + buffer(nx-1) = afix*(ux(nx,j,k)-ux(nx-2,j,k)) & + + bfix*(ux(1,j,k)-ux(nx-3,j,k)) + buffer(nx) = afix*(ux(1,j,k)-ux(nx-1,j,k)) & + + bfix*(ux(2,j,k)-ux(nx-2,j,k)) + + ! Solve tri-diagonal system + call thomas1d(buffer, x3dop%f, x3dop%s, x3dop%w, x3dop%periodic, x3dop%alfa, nx) + do concurrent (i=1:nx) + tx(i,j,k) = buffer(i) + enddo enddo +#endif end subroutine derx_00 From 1182a1ee54824f47f2e2a931cd466a02e72c6b83 Mon Sep 17 00:00:00 2001 From: cflag Date: Wed, 9 Mar 2022 12:10:37 +0100 Subject: [PATCH 10/10] Minor update --- src/x3d_derive.f90 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/x3d_derive.f90 b/src/x3d_derive.f90 index 99773ff..9c93987 100644 --- a/src/x3d_derive.f90 +++ b/src/x3d_derive.f90 @@ -193,6 +193,7 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) ! Local variables integer :: i, j, k +#if 1 real(mytype), dimension(nx) :: buffer ! This is working (with deepcopy) @@ -221,8 +222,9 @@ subroutine derx_00(tx,ux,x3dop,nx,ny,nz) enddo enddo !$acc end parallel loop +#else + real(mytype), dimension(nx) :: buffer -#ifdef 0 ! This is broken (with deepcopy) do concurrent (k=1:nz, j=1:ny) local(buffer) ! Compute r.h.s.