Skip to content

Commit 14ad361

Browse files
authored
Merge pull request #3020 from andrew-platt/f/TurbSim_OpenMP
TurbSim: OpenMP parallelization in factorization and FFT
2 parents 5aa1e87 + 5858941 commit 14ad361

File tree

2 files changed

+102
-133
lines changed

2 files changed

+102
-133
lines changed

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ if (OPENMP)
125125
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
126126
link_libraries("${OpenMP_CXX_LIBRARIES}")
127127
endif()
128+
elseif (NOT BLA_VENDOR)
129+
# If we're not using OpenMP, and a specific BLAS vendor has not been set,
130+
# set MKL threading to sequential to avoid potential issues with
131+
# small calculations taking longer due to threading overhead (turbsim).
132+
set(MKL_THREADING "sequential")
128133
endif()
129134

130135
#-------------------------------------------------------------------------------

modules/turbsim/src/TSsubs.f90

Lines changed: 97 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -39,20 +39,24 @@ MODULE TSSubs
3939
!! real array) of the simulated velocity (wind/water speed). It returns
4040
!! values FOR ONLY the velocity components that use the IEC method for
4141
!! computing spatial coherence; i.e., for i where SCMod(i) == CohMod_IEC
42-
SUBROUTINE CalcFourierCoeffs_IEC( p, U, PhaseAngles, S, V, TRH, ErrStat, ErrMsg )
42+
!!
43+
!! OpenMP: This makes a copy of the TRH array for each thread to use, which
44+
!! is a little inefficient, but the speedup from parallelization
45+
!! should outweigh the memory overhead. In the single threaded case,
46+
!! a single copy is made, which is relatively negligible.
47+
SUBROUTINE CalcFourierCoeffs_IEC( p, U, PhaseAngles, S, V, TRH_in, ErrStat, ErrMsg )
4348

4449
TYPE(TurbSim_ParameterType), INTENT(IN ) :: p !< TurbSim parameters
4550
REAL(ReKi), INTENT(IN) :: U (:) !< The steady u-component wind speeds for the grid (NPoints).
4651
REAL(ReKi), INTENT(IN) :: PhaseAngles (:,:,:) !< The array that holds the random phases [number of points, number of frequencies, number of wind components=3].
4752
REAL(ReKi), INTENT(IN) :: S (:,:,:) !< The turbulence PSD array (NumFreq,NPoints,3).
4853
REAL(ReKi), INTENT(INOUT) :: V (:,:,:) !< An array containing the summations of the rows of H (NumSteps,NPoints,3).
49-
REAL(ReKi), INTENT(INOUT) :: TRH (:) !< The transfer function matrix. just used as a work array
54+
REAL(ReKi), INTENT(INOUT) :: TRH_in(:) !< The transfer function matrix. just used as a work array
5055
INTEGER(IntKi), INTENT(OUT) :: ErrStat
5156
CHARACTER(*), INTENT(OUT) :: ErrMsg
5257

53-
54-
! Internal variables
55-
58+
!$OMP THREADPRIVATE(TRH)
59+
REAL(ReKi), allocatable, save :: TRH(:) ! Each OMP thread gets its own copy of this array
5660
REAL(ReKi), ALLOCATABLE :: Dist(:) ! The distance between points
5761
REAL(ReKi), ALLOCATABLE :: DistU(:)
5862

@@ -64,25 +68,22 @@ SUBROUTINE CalcFourierCoeffs_IEC( p, U, PhaseAngles, S, V, TRH, ErrStat, ErrMsg
6468

6569
INTEGER(IntKi) :: ErrStat2
6670
CHARACTER(MaxMsgLen) :: ErrMsg2
67-
68-
69-
71+
7072
ErrStat = ErrID_None
7173
ErrMsg = ""
7274

73-
IF (.NOT. ANY(p%met%SCMod == CohMod_IEC) ) RETURN
75+
IF (.NOT. ANY(p%met%SCMod == CohMod_IEC)) RETURN
7476

7577
!--------------------------------------------------------------------------------
7678
! allocate arrays
7779
!--------------------------------------------------------------------------------
78-
CALL AllocAry( Dist, p%grid%NPacked, 'Dist coherence array', ErrStat2, ErrMsg2 ); CALL SetErrStat(ErrStat2, ErrMsg2, ErrStat, ErrMsg, 'CalcFourierCoeffs_IEC')
79-
CALL AllocAry( DistU, p%grid%NPacked, 'DistU coherence array', ErrStat2, ErrMsg2 ); CALL SetErrStat(ErrStat2, ErrMsg2, ErrStat, ErrMsg, 'CalcFourierCoeffs_IEC')
80-
IF (ErrStat >= AbortErrLev) THEN
81-
CALL Cleanup()
82-
RETURN
83-
END IF
84-
85-
80+
81+
CALL AllocAry( Dist, p%grid%NPacked, 'Dist coherence array', ErrStat2, ErrMsg2 ); CALL SetErrStat(ErrStat2, ErrMsg2, ErrStat, ErrMsg, 'CalcFourierCoeffs_IEC')
82+
CALL AllocAry( DistU, p%grid%NPacked, 'DistU coherence array', ErrStat2, ErrMsg2 ); CALL SetErrStat(ErrStat2, ErrMsg2, ErrStat, ErrMsg, 'CalcFourierCoeffs_IEC')
83+
IF (ErrStat >= AbortErrLev) RETURN
84+
85+
TRH = TRH_in ! point the PRIVATE array to the passed in array for single thread case
86+
8687
!--------------------------------------------------------------------------------
8788
! Calculate the distances and other parameters that don't change with frequency
8889
!---------------------------------------------------------------------------------
@@ -115,7 +116,13 @@ SUBROUTINE CalcFourierCoeffs_IEC( p, U, PhaseAngles, S, V, TRH, ErrStat, ErrMsg
115116
! Calculate the coherence, Veers' H matrix (CSDs), and the fourier coefficients
116117
!---------------------------------------------------------------------------------
117118

118-
DO IFREQ = 1,p%grid%NumFreq
119+
!$OMP PARALLEL DO &
120+
!$OMP DEFAULT(None) &
121+
!$OMP SHARED(p, PhaseAngles, S, V, Dist, DistU, IVec, ErrStat, ErrMsg, AbortErrLev) &
122+
!$OMP PRIVATE(Indx, I, J, ErrStat2, ErrMsg2) &
123+
!$OMP COPYIN(TRH)
124+
DO IFREQ = 1, p%grid%NumFreq
125+
119126
! -----------------------------------------------
120127
! Create the coherence matrix for this frequency
121128
! -----------------------------------------------
@@ -149,27 +156,18 @@ SUBROUTINE CalcFourierCoeffs_IEC( p, U, PhaseAngles, S, V, TRH, ErrStat, ErrMsg
149156
! use H matrix to calculate coefficients
150157
! -----------------------------------------------
151158

152-
CALL Coh2H( p, IVec, IFreq, TRH, S, ErrStat2, ErrMsg2 )
159+
CALL Coh2H(p, IVec, IFreq, TRH, S, ErrStat2, ErrMsg2)
160+
if (ErrStat2 >= AbortErrLev) then
161+
!$OMP CRITICAL
153162
CALL SetErrStat(ErrStat2, ErrMsg2, ErrStat, ErrMsg, 'CalcFourierCoeffs_IEC')
154-
IF (ErrStat >= AbortErrLev) THEN
155-
CALL Cleanup()
156-
RETURN
157-
END IF
158-
CALL H2Coeffs( IVec, IFreq, TRH, PhaseAngles, V, p%grid%NPoints )
159-
END DO !IFreq
163+
!$OMP END CRITICAL
164+
else
165+
CALL H2Coeffs( IVec, IFreq, TRH, PhaseAngles, V, p%grid%NPoints )
166+
endif
160167

168+
END DO !IFreq
161169
END DO !IVec
162170

163-
CALL Cleanup()
164-
RETURN
165-
166-
!............................................
167-
CONTAINS
168-
SUBROUTINE Cleanup()
169-
170-
IF ( ALLOCATED( Dist ) ) DEALLOCATE( Dist )
171-
IF ( ALLOCATED( DistU ) ) DEALLOCATE( DistU )
172-
END SUBROUTINE Cleanup
173171
!............................................
174172
END SUBROUTINE CalcFourierCoeffs_IEC
175173
!=======================================================================
@@ -720,18 +718,17 @@ SUBROUTINE EyeCoh2H( IVec, IFreq, TRH, S, NPoints )
720718
! -----------------------------------------------------------------------------------
721719

722720
Indx = 1
721+
TRH = 0.0
723722
DO J = 1,NPoints ! The column number
724723

725724
! The diagonal entries of the matrix:
726725

727726
TRH(Indx) = SQRT( ABS( S(IFreq,J,IVec) ) )
728727

729-
! The off-diagonal values:
730-
Indx = Indx + 1
731-
DO I = J+1,NPoints ! The row number
732-
TRH(Indx) = 0.0
733-
Indx = Indx + 1
734-
ENDDO ! I
728+
! skip rest of row (NPoints-1) -- these are off diagonal elements that are zero.
729+
! Then add 1 to get to next diagonal entry
730+
Indx = Indx + (NPoints - J) + 1
731+
735732
ENDDO ! J
736733

737734
END SUBROUTINE EyeCoh2H
@@ -751,8 +748,9 @@ SUBROUTINE Coh2H( p, IVec, IFreq, TRH, S, ErrStat, ErrMsg )
751748

752749

753750
integer :: Indx, J, I, NPts
754-
751+
integer :: old_max_levels ! maximum nesting levels for OPENMP
755752

753+
756754
! -------------------------------------------------------------
757755
! Calculate the Cholesky factorization for the coherence matrix
758756
! -------------------------------------------------------------
@@ -872,115 +870,81 @@ END SUBROUTINE H2Coeffs
872870
!> This routine takes the Fourier coefficients and converts them to velocity
873871
!! note that the resulting time series has zero mean.
874872
SUBROUTINE Coeffs2TimeSeries( V, NumSteps, NPoints, NUsrPoints, ErrStat, ErrMsg )
875-
876-
877-
!USE NWTC_FFTPACK
878-
879-
IMPLICIT NONE
880-
881-
882-
! passed variables
883873
INTEGER(IntKi), INTENT(IN) :: NumSteps !< Size of dimension 1 of V (number of time steps)
884874
INTEGER(IntKi), INTENT(IN) :: NPoints !< Size of dimension 2 of V (number of grid points)
885875
INTEGER(IntKi), INTENT(IN) :: NUsrPoints !< number of user-defined time series
886-
887876
REAL(ReKi), INTENT(INOUT) :: V (NumSteps,NPoints,3) !< An array containing the summations of the rows of H (NumSteps,NPoints,3).
888-
889877
INTEGER(IntKi), intent( out) :: ErrStat !< Error level
890878
CHARACTER(*), intent( out) :: ErrMsg !< Message describing error
891879

892-
893-
! local variables
894880
TYPE(FFT_DataType) :: FFT_Data ! data for applying FFT
895881
REAL(SiKi), ALLOCATABLE :: Work ( : ) ! working array to hold coefficients of fft !bjj: made it allocatable so it doesn't take stack space
896-
897-
898-
INTEGER(IntKi) :: ITime ! loop counter for time step/frequency
899882
INTEGER(IntKi) :: IVec ! loop counter for velocity components
900883
INTEGER(IntKi) :: IPoint ! loop counter for grid points
901-
884+
logical :: ExitOMPlooping ! flag to indicate skipping other loops
902885
INTEGER(IntKi) :: ErrStat2 ! Error level (local)
903-
!CHARACTER(MaxMsgLen) :: ErrMsg2 ! Message describing error (local)
904886

905887

906-
! initialize variables
907-
908-
!ErrStat = ErrID_None
909-
!ErrMsg = ""
910-
911888
CALL AllocAry(Work, NumSteps, 'Work',ErrStat,ErrMsg)
912889
if (ErrStat >= AbortErrLev) return
913890

914-
! Allocate the FFT working storage and initialize its variables
915-
916-
CALL InitFFT( NumSteps, FFT_Data, ErrStat=ErrStat2 )
917-
CALL SetErrStat(ErrStat2, 'Error in InitFFT', ErrStat, ErrMsg, 'Coeffs2TimeSeries' )
918-
IF (ErrStat >= AbortErrLev) THEN
919-
CALL Cleanup()
920-
RETURN
921-
END IF
922-
923-
924891
! Get the stationary-point time series.
925892

926-
CALL WrScr ( ' Generating time series for all points:' )
893+
CALL WrScr ( ' Generating time series for all points:' )
927894

928-
DO IVec=1,3
895+
! Since we are potentially using OpenMP here, we cannot
896+
ExitOMPlooping = .false.
929897

930-
CALL WrScr ( ' '//Comp(IVec)//'-component' )
898+
DO IVec=1,3
931899

932-
DO IPoint=1,NPoints !NTotB
900+
! make sure we didn't have a failure on prior OMP loop
901+
if (ExitOMPlooping) cycle
933902

934-
! Overwrite the first point with zero. This sets the real (and
935-
! imaginary) part of the steady-state value to zero so that we
936-
! can add in the mean value later.
903+
CALL WrScr ( ' '//Comp(IVec)//'-component' )
937904

938-
Work(1) = 0.0_ReKi
905+
! The FFT_Data is not thread safe with the allocation inside.
906+
CALL InitFFT( NumSteps, FFT_Data, ErrStat=ErrStat2 )
907+
CALL SetErrStat(ErrStat2, 'Error in InitFFT', ErrStat, ErrMsg, 'Coeffs2TimeSeries' )
939908

940-
! DO ITime = 2,NumSteps-1
941-
DO ITime = 2,NumSteps
942-
Work(ITime) = V(ITime-1, IPoint, IVec)
943-
ENDDO ! ITime
944-
945-
IF (iPoint > NUsrPoints) THEN
946-
! BJJ: we can't override this for the user-input spectra or we don't get the correct time series out.
947-
! Per JMJ, I will keep this here for the other points, but I personally think it could be skipped, too.
909+
! Proceed only if the InitFFT worked.
910+
! NOTE: this is to allow for OpenMP - can't return from inside loop
911+
if (ErrStat2 < AbortErrLev) then ! check ErrStat2 for this OMPthread
912+
DO IPoint=1,NPoints !NTotB
913+
! Overwrite the first point with zero. This sets the real (and
914+
! imaginary) part of the steady-state value to zero so that we
915+
! can add in the mean value later.
916+
Work(1) = 0.0_ReKi
917+
Work(2:NumSteps) = V(1:NumSteps-1, IPoint, IVec)
948918

949-
! Now, let's add a complex zero to the end to set the power in the Nyquist
950-
! frequency to zero.
919+
IF (iPoint > NUsrPoints) THEN
920+
! BJJ: we can't override this for the user-input spectra or we don't get the correct time series out.
921+
! Per JMJ, I will keep this here for the other points, but I personally think it could be skipped, too.
922+
923+
! Now, let's add a complex zero to the end to set the power in the Nyquist
924+
! frequency to zero.
925+
Work(NumSteps) = 0.0
926+
END IF
951927

952-
Work(NumSteps) = 0.0
953-
END IF
954-
928+
! perform FFT
929+
CALL ApplyFFT( Work, FFT_Data, ErrStat2 )
930+
IF (ErrStat2 /= ErrID_None ) THEN
931+
CALL SetErrStat(ErrStat2, 'Error in ApplyFFT for point '//TRIM(Num2LStr(IPoint))//'.', ErrStat, ErrMsg, 'Coeffs2TimeSeries' )
932+
IF (ErrStat >= AbortErrLev) EXIT
933+
END IF
934+
935+
V(:,IPoint,IVec) = Work
936+
ENDDO ! IPoint
955937

938+
! Clean up memory from FFT_Data.
939+
CALL ExitFFT( FFT_Data, ErrStat2 )
940+
CALL SetErrStat(ErrStat2, 'Error in ExitFFT', ErrStat, ErrMsg, 'Coeffs2TimeSeries' )
956941

957-
! perform FFT
958-
959-
CALL ApplyFFT( Work, FFT_Data, ErrStat2 )
960-
IF (ErrStat2 /= ErrID_None ) THEN
961-
CALL SetErrStat(ErrStat2, 'Error in ApplyFFT for point '//TRIM(Num2LStr(IPoint))//'.', ErrStat, ErrMsg, 'Coeffs2TimeSeries' )
962-
IF (ErrStat >= AbortErrLev) EXIT
963-
END IF
964-
965-
V(:,IPoint,IVec) = Work
966-
967-
ENDDO ! IPoint
968-
969-
ENDDO ! IVec
970-
971-
CALL Cleanup()
972-
973-
RETURN
974-
CONTAINS
975-
!...........................................
976-
SUBROUTINE Cleanup()
977-
978-
CALL ExitFFT( FFT_Data, ErrStat2 )
979-
CALL SetErrStat(ErrStat2, 'Error in ExitFFT', ErrStat, ErrMsg, 'Coeffs2TimeSeries' )
942+
! skip further OMP loops if any sequential (or if OMP not used).
943+
! NOTE: OMP doesn't allow return inside OMP thread
944+
if (ErrStat2 >= AbortErrLev) ExitOMPlooping = .true.
945+
endif
946+
ENDDO ! IVec
980947

981-
if (allocated(work)) deallocate(work)
982-
983-
END SUBROUTINE Cleanup
984948
END SUBROUTINE Coeffs2TimeSeries
985949
!=======================================================================
986950
!> This routine calculates the two-sided Fourier amplitudes of the frequencies
@@ -2030,28 +1994,28 @@ SUBROUTINE AddMeanAndRotate(p, V, U, HWindDir, VWindDir)
20301994
REAL(ReKi) :: v3(3) ! temporary 3-component array containing velocity
20311995
INTEGER(IntKi) :: ITime ! loop counter for time step
20321996
INTEGER(IntKi) :: IPoint ! loop counter for grid points
2033-
2034-
2035-
2036-
1997+
20371998
!..............................................................................
2038-
! Add mean wind to u' components and rotate to inertial reference
1999+
! Add mean wind to u' components and rotate to inertial reference
20392000
! frame coordinate system
2040-
!..............................................................................
2001+
!..............................................................................
2002+
2003+
!$OMP PARALLEL DO &
2004+
!$OMP COLLAPSE(2) &
2005+
!$OMP DEFAULT(None) &
2006+
!$OMP PRIVATE(v3) &
2007+
!$OMP SHARED(p, U, V, HWindDir, VWindDir)
20412008
DO IPoint=1,p%grid%Npoints
20422009
DO ITime=1,p%grid%NumSteps
20432010

2044-
! Add mean wind speed to the streamwise component and
2045-
! Rotate the wind to the X-Y-Z (inertial) reference frame coordinates:
2046-
2011+
! Add mean wind speed to the streamwise component and
2012+
! Rotate the wind to the X-Y-Z (inertial) reference frame coordinates:
20472013
v3 = V(ITime,IPoint,:)
20482014
CALL CalculateWindComponents( v3, U(IPoint), HWindDir(IPoint), VWindDir(IPoint), V(ITime,IPoint,:) )
20492015

20502016
ENDDO ! ITime
2051-
20522017
ENDDO ! IPoint
2053-
2054-
2018+
20552019
END SUBROUTINE AddMeanAndRotate
20562020
!=======================================================================
20572021
SUBROUTINE TS_ValidateInput(P, ErrStat, ErrMsg)

0 commit comments

Comments
 (0)