From 873d71d9265423a34291e7760a51ea7eccb518ee Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:10:14 -0500 Subject: [PATCH 01/33] Start adding preprocessors guards around openmp statements --- src/helpme.h | 2 ++ src/tensor_utils.h | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index a269c10..0de4c6d 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -817,7 +817,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; diff --git a/src/tensor_utils.h b/src/tensor_utils.h index 807367f..97d043a 100644 --- a/src/tensor_utils.h +++ b/src/tensor_utils.h @@ -32,7 +32,9 @@ namespace helpme { template void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ cbaPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) for (int A = 0; A <= -1 + aDimension; ++A) @@ -52,7 +54,9 @@ void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int template void permuteABCtoACB(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ acbPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int A = 0; A <= -1 + aDimension; ++A) for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) From 62098c12cbbe7ab671060f6e6e0d32399d4979dd Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:15:28 -0500 Subject: [PATCH 02/33] Add more guards --- src/helpme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index 0de4c6d..f5017a9 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -926,7 +926,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -1085,7 +1087,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; From ac907bca37f7dd3c8db20232640d71939ff6f3dc Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:19:50 -0500 Subject: [PATCH 03/33] Add another guard --- src/helpme.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index f5017a9..35ce561 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1546,9 +1546,9 @@ class PMEInstance { int nComponents = nCartesian(parameterAngMom); size_t numBA = (size_t)myGridDimensionB_ * myGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -1565,7 +1565,9 @@ class PMEInstance { spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, fractionalParameters, threadID); } +#ifdef _OPENMP } +#endif return realGrid; } From 47226ff5a2ff9f3010b1ab35bd3583d35a1010fb Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:25:39 -0500 Subject: [PATCH 04/33] 3 more guards --- src/helpme.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/helpme.h b/src/helpme.h index 35ce561..bcf4ee0 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1587,9 +1587,9 @@ class PMEInstance { gridAtomList_.resize(gridDimensionC_); // Classify atoms to their worker threads first, then construct splines for each thread +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -1630,8 +1630,9 @@ class PMEInstance { } } numAtomsPerThread_[threadID] = myNumAtoms; +#ifdef _OPENMP } - +#endif // We could intervene here and do some load balancing by inspecting the list. Currently // the lazy approach of just assuming that the atoms are evenly distributed along c is used. @@ -1650,9 +1651,9 @@ class PMEInstance { threadOffset[thread] = threadOffset[thread - 1] + numAtomsPerThread_[thread - 1]; } +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -1685,12 +1686,13 @@ class PMEInstance { splineOrder_, splineDerivativeLevel); } } +#ifdef _OPENMP } - +#endif // Finally, find all of the splines that this thread will need to handle +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -1705,7 +1707,9 @@ class PMEInstance { } ++count; } +#ifdef _OPENMP } +#endif } /*! From d7818e05636407367f1de5d1dea33a844e346e7f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:36:39 -0500 Subject: [PATCH 05/33] Need to keep the code blocks to avoid redeclaring the threadID variable. --- src/helpme.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/helpme.h b/src/helpme.h index bcf4ee0..3fa077b 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1592,6 +1592,7 @@ class PMEInstance { { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif for (size_t row = threadID; row < gridDimensionC_; row += nThreads_) { @@ -1630,9 +1631,7 @@ class PMEInstance { } } numAtomsPerThread_[threadID] = myNumAtoms; -#ifdef _OPENMP } -#endif // We could intervene here and do some load balancing by inspecting the list. Currently // the lazy approach of just assuming that the atoms are evenly distributed along c is used. @@ -1656,6 +1655,7 @@ class PMEInstance { { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif size_t entry = threadOffset[threadID]; @@ -1686,15 +1686,14 @@ class PMEInstance { splineOrder_, splineDerivativeLevel); } } -#ifdef _OPENMP } -#endif // Finally, find all of the splines that this thread will need to handle #ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto &mySplineList = splinesPerThread_[threadID]; @@ -1707,9 +1706,7 @@ class PMEInstance { } ++count; } -#ifdef _OPENMP } -#endif } /*! @@ -1924,15 +1921,17 @@ class PMEInstance { helpme::vector buffer(nThreads_ * scratchRowDim); // A transform, with instant sort to CAB ordering for each local block +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; +#ifdef _OPENMP #pragma omp for +#endif for (int c = 0; c < subsetOfCAlongA_; ++c) { for (int b = 0; b < myGridDimensionB_; ++b) { Real *gridPtr = realGrid + c * myGridDimensionB_ * gridDimensionA_ + b * gridDimensionA_; @@ -1945,7 +1944,9 @@ class PMEInstance { } } } +#ifdef _OPENMP } +#endif #if HAVE_MPI == 1 // Communicate A back to blocks From 5f7ad1fb409d8641f7e42bdde4f812b14b34d066 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:45:43 -0500 Subject: [PATCH 06/33] Three more guards --- src/helpme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index 3fa077b..d0dcab3 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1978,7 +1978,9 @@ class PMEInstance { // B transform size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_FORWARD); } @@ -2026,7 +2028,9 @@ class PMEInstance { #endif // C transform size_t numBA = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ba = 0; ba < numBA; ++ba) { fftHelperC_.transform(buffer2 + ba * gridDimensionC_, FFTW_FORWARD); } @@ -2053,7 +2057,9 @@ class PMEInstance { // C transform size_t numYX = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t yx = 0; yx < numYX; ++yx) { fftHelperC_.transform(convolvedGrid + yx * gridDimensionC_, FFTW_BACKWARD); } From fe0622e5354edc8ee40eca24e64dba8400dbec50 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 15:50:23 -0500 Subject: [PATCH 07/33] More pragma guards --- src/helpme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index d0dcab3..4f82de8 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -2111,11 +2111,15 @@ class PMEInstance { // B transform with instant sort of local blocks from CAB -> CBA order size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_BACKWARD); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int c = 0; c < subsetOfCAlongB_; ++c) { for (int a = 0; a < myComplexGridDimensionA_; ++a) { int cx = c * myComplexGridDimensionA_ * gridDimensionB_ + a * gridDimensionB_; @@ -2162,7 +2166,9 @@ class PMEInstance { // A transform Real *realGrid = reinterpret_cast(buffer2); +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int cb = 0; cb < subsetOfCAlongA_ * myGridDimensionB_; ++cb) { fftHelperA_.transform(buffer1 + cb * complexGridDimensionA_, realGrid + cb * gridDimensionA_); } From d600734f4e72607784be981ee250f02c26e27f02 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 15:55:19 -0500 Subject: [PATCH 08/33] 2 more guards --- src/helpme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index 4f82de8..c57286c 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -2281,7 +2281,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = 0; // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { energy += transformedGrid[yxz] * transformedGrid[yxz] * influenceFunction[yxz]; transformedGrid[yxz] *= influenceFunction[yxz]; @@ -2312,7 +2314,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = Complex(0, 0); const size_t numCTerms(myNumKSumTermsC_); +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; int kx = firstKSumTermA_ + xz / numCTerms; From 3e9aae684a36537b07c0307e01c0e919d1f61417 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 17:16:26 -0500 Subject: [PATCH 09/33] Add brackets for non openmp to be consistent with other code blocks --- src/helpme.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/helpme.h b/src/helpme.h index c57286c..f61ffc6 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1551,6 +1551,7 @@ class PMEInstance { { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif for (size_t row = threadID; row < myGridDimensionC_; row += nThreads_) { @@ -1565,9 +1566,7 @@ class PMEInstance { spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, fractionalParameters, threadID); } -#ifdef _OPENMP } -#endif return realGrid; } @@ -1926,6 +1925,7 @@ class PMEInstance { { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; @@ -1944,9 +1944,7 @@ class PMEInstance { } } } -#ifdef _OPENMP } -#endif #if HAVE_MPI == 1 // Communicate A back to blocks @@ -2414,14 +2412,15 @@ class PMEInstance { } } } +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); +#pragma omp for #else + { int threadID = 0; #endif -#pragma omp for for (size_t atom = 0; atom < nAtoms; ++atom) { const auto &cacheEntry = splineCache_[atom]; const auto &absAtom = cacheEntry.absoluteAtomNumber; From 4327b35751d294a65f01a8350faf7d9c3c986b31 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 17:21:20 -0500 Subject: [PATCH 10/33] Guard 2 more --- src/helpme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index f61ffc6..daa4f0c 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -2802,7 +2802,9 @@ class PMEInstance { // Direct space, using simple O(N^2) algorithm. This can be improved using a nonbonded list if needed. Real cutoffSquared = sphericalCutoff * sphericalCutoff; Real kappaSquared = kappa_ * kappa_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t i = 0; i < nAtoms; ++i) { const auto &coordsI = coordinates.row(i); Real *phiPtr = potential[i]; @@ -2834,7 +2836,9 @@ class PMEInstance { } else { std::logic_error("Unknown algorithm in helpme::computePAtAtomicSites"); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t atom = 0; atom < nAtoms; ++atom) { const auto &cacheEntry = splineCache_[atom]; const auto &absAtom = cacheEntry.absoluteAtomNumber; From 6f85b6a0a8fb8d716d49b5185bd7b9aaf8d47723 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 06:30:45 -0500 Subject: [PATCH 11/33] Fix signed/unsigned comparison --- src/matrix.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/matrix.h b/src/matrix.h index e3f2d0a..467382c 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -575,10 +575,10 @@ class Matrix { unsortedEigenVectors.transposeInPlace(); std::vector> eigenPairs; - for (int val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); + for (size_t val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); std::sort(eigenPairs.begin(), eigenPairs.end()); if (order == SortOrder::Descending) std::reverse(eigenPairs.begin(), eigenPairs.end()); - for (int val = 0; val < nRows_; ++val) { + for (size_t val = 0; val < nRows_; ++val) { const auto& e = eigenPairs[val]; eigenValues.data_[val] = std::get<0>(e); std::copy(std::get<1>(e), std::get<1>(e) + nCols_, sortedEigenVectors[val]); From 489ca5089aab278459b44f0b021e1c39db5f6a3e Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 06:55:51 -0500 Subject: [PATCH 12/33] Fix signed/unsigned comparison --- src/helpme.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index daa4f0c..bb91de4 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1037,7 +1037,8 @@ class PMEInstance { if (coordinates.nRows() != parameters.nRows()) throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); - if (parameters.nCols() != (nCartesian(parameterAngMom) - cartesianOffset)) + if (n_param_cols < 0 || + parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( "Mismatch in the number of parameters provided and the parameter angular momentum"); } From 65e22cffef51f1e8adad494a6904caad8b9ba014 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:02:57 -0500 Subject: [PATCH 13/33] Fix signed/unsigne comparison --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index bb91de4..266da6d 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1595,7 +1595,7 @@ class PMEInstance { { int threadID = 0; #endif - for (size_t row = threadID; row < gridDimensionC_; row += nThreads_) { + for (int row = threadID; row < gridDimensionC_; row += nThreads_) { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; From 5e1363faf630f9759863cedea3250208e0c73dd2 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:23:07 -0500 Subject: [PATCH 14/33] Use integer type and cast to integer to be consistent and avoid losing bits. --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index 266da6d..5c1801d 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1608,7 +1608,7 @@ class PMEInstance { atomCoords[2] * recVecs_(2, 2) - EPS; cCoord -= floor(cCoord); short cStartingGridPoint = gridDimensionC_ * cCoord; - size_t thisAtomsThread = cStartingGridPoint % nThreads_; + int thisAtomsThread = (int)cStartingGridPoint % nThreads_; const auto &cGridIterator = gridIteratorC_[cStartingGridPoint]; if (cGridIterator.size() && thisAtomsThread == threadID) { Real aCoord = atomCoords[0] * recVecs_(0, 0) + atomCoords[1] * recVecs_(1, 0) + From 4380e389d877e7acdb74c01ebd35dc8ffcc571f4 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:28:24 -0500 Subject: [PATCH 15/33] Comment out currently unused variable. Need to check if this should be gridIteratorC_ or not. --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index 5c1801d..6c512e7 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1599,7 +1599,7 @@ class PMEInstance { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; - const auto &gridIteratorC = threadedGridIteratorC_[threadID]; + //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; for (int atom = 0; atom < nAtoms; ++atom) { From c7e2e7c95a20d71c672491ef93c1166aa2d31306 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:31:53 -0500 Subject: [PATCH 16/33] Fix signed/unsigned comparison --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index 6c512e7..4c2e0f2 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1642,7 +1642,7 @@ class PMEInstance { // certain scale factor to try and minimize allocations in a not-too-wasteful manner. if (splineCache_.size() < numCacheEntries) { size_t newSize = static_cast(1.2 * numCacheEntries); - for (int atom = splineCache_.size(); atom < newSize; ++atom) + for (size_t atom = splineCache_.size(); atom < newSize; ++atom) splineCache_.emplace_back(splineOrder_, splineDerivativeLevel); } std::vector threadOffset(nThreads_, 0); From 13f53d9c39b05fe9cf31c526390f340f106ee00d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:34:59 -0500 Subject: [PATCH 17/33] Make loop var int since all other vars are int --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index 4c2e0f2..8dbcca1 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1659,7 +1659,7 @@ class PMEInstance { int threadID = 0; #endif size_t entry = threadOffset[threadID]; - for (size_t cRow = threadID; cRow < gridDimensionC_; cRow += nThreads_) { + for (int cRow = threadID; cRow < gridDimensionC_; cRow += nThreads_) { for (const auto &gridPointAndAtom : gridAtomList_[cRow]) { size_t atom = gridPointAndAtom.second; const Real *atomCoords = coords[atom]; From 44524427140f2ea58ca36107759838e733ce4cfe Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:37:26 -0500 Subject: [PATCH 18/33] Make loop var int --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index 8dbcca1..228b400 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1555,7 +1555,7 @@ class PMEInstance { { int threadID = 0; #endif - for (size_t row = threadID; row < myGridDimensionC_; row += nThreads_) { + for (int row = threadID; row < myGridDimensionC_; row += nThreads_) { std::fill(&realGrid[row * numBA], &realGrid[(row + 1) * numBA], Real(0)); } for (const auto &spline : splinesPerThread_[threadID]) { From 4a7f7cfc221da2051f05d099a22920c812d95adc Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:49:28 -0500 Subject: [PATCH 19/33] Protect MPI-only variable --- src/helpme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/helpme.h b/src/helpme.h index 228b400..e9ec691 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1881,14 +1881,20 @@ class PMEInstance { * \return Pointer to the transformed grid, which is stored in one of the buffers in BAC order. */ Complex *forwardTransform(Real *realGrid) { +#if HAVE_MPI == 1 Real *__restrict__ realCBA; +#endif Complex *__restrict__ buffer1, *__restrict__ buffer2; if (realGrid == reinterpret_cast(workSpace1_.data())) { +#if HAVE_MPI == 1 realCBA = reinterpret_cast(workSpace2_.data()); +#endif buffer1 = workSpace2_.data(); buffer2 = workSpace1_.data(); } else { +#if HAVE_MPI == 1 realCBA = reinterpret_cast(workSpace1_.data()); +#endif buffer1 = workSpace1_.data(); buffer2 = workSpace2_.data(); } From 48603e525673318c19d4d88f7863dce806502f04 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 08:33:47 -0500 Subject: [PATCH 20/33] Change int to size_t to avoid signed/unsigned comparison --- src/matrix.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/matrix.h b/src/matrix.h index 467382c..ba47982 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -333,8 +333,8 @@ class Matrix { */ void assertSymmetric(const Real& threshold = 1e-10f) const { assertSquare(); - for (int row = 0; row < nRows_; ++row) { - for (int col = 0; col < row; ++col) { + for (size_t row = 0; row < nRows_; ++row) { + for (size_t col = 0; col < row; ++col) { if (std::abs(data_[row * nCols_ + col] - data_[col * nCols_ + row]) > threshold) throw std::runtime_error("Unexpected non-symmetric matrix found."); } From c5f646af492098cb686495a51ca1b7137712e922 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 08:40:09 -0500 Subject: [PATCH 21/33] Comment out unused var --- src/lapack_wrapper.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/lapack_wrapper.h b/src/lapack_wrapper.h index fbe62d9..f6c5956 100644 --- a/src/lapack_wrapper.h +++ b/src/lapack_wrapper.h @@ -128,7 +128,8 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re Real threshold_norm; Real threshold; Real tan_phi, sin_phi, cos_phi, tan2_phi, sin2_phi, cos2_phi; - Real sin_2phi, cos_2phi, cot_2phi; + Real sin_2phi, cot_2phi; + //Real cos_2phi; Real dum1; Real dum2; Real dum3; @@ -181,7 +182,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re if (tan_phi < 0) sin_phi = -sin_phi; cos_phi = sqrt(cos2_phi); sin_2phi = 2 * sin_phi * cos_phi; - cos_2phi = cos2_phi - sin2_phi; + //cos_2phi = cos2_phi - sin2_phi; // Rotate columns k and m for both the matrix A // and the matrix of eigenvectors. From 78ab56686eac055e1942a141ef65cbc3b168fe46 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 08:42:58 -0500 Subject: [PATCH 22/33] Change to size_t to avoid signed/unsigned comparison --- src/matrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix.h b/src/matrix.h index ba47982..599cf27 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -361,7 +361,7 @@ class Matrix { Matrix evecs = std::get<1>(eigenPairs); evalsReal.applyOperationToEachElement(function); Matrix evecsT = evecs.transpose(); - for (int row = 0; row < nRows_; ++row) { + for (size_t row = 0; row < nRows_; ++row) { Real transformedEigenvalue = evalsReal[row][0]; std::for_each(evecsT.data_ + row * nCols_, evecsT.data_ + (row + 1) * nCols_, [&](Real& val) { val *= transformedEigenvalue; }); From ca0bd2e19f0e345ce0d2599474bffed17df262e2 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 10:26:16 -0500 Subject: [PATCH 23/33] Protect against signed int comparison to unsigned overflow --- src/splines.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/splines.h b/src/splines.h index acbe7f6..f6015d3 100644 --- a/src/splines.h +++ b/src/splines.h @@ -88,7 +88,10 @@ class BSpline { derivativeLevel_ = derivativeLevel; // The +1 is to account for the fact that we need to store entries up to and including the max. - if (splines_.nRows() < derivativeLevel + 1 || splines_.nCols() != order) + if (derivativeLevel < 0 || order < 0) { + throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); + } + if (splines_.nRows() < (size_t)derivativeLevel + 1 || splines_.nCols() != (size_t)order) splines_ = Matrix(derivativeLevel + 1, order); splines_.setZero(); From 3dde10e741ed845aa0562941c3fe52ce82e67d1f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 10:29:55 -0500 Subject: [PATCH 24/33] Fix 3 signed/unsigned comparison --- src/matrix.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/matrix.h b/src/matrix.h index 599cf27..cb13439 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -397,10 +397,10 @@ class Matrix { throw std::runtime_error("Attempting to multiply matrices with incompatible dimensions."); Matrix product(nRows_, other.nCols_); Real* output = product.data_; - for (int row = 0; row < nRows_; ++row) { + for (size_t row = 0; row < nRows_; ++row) { const Real* rowPtr = data_ + row * nCols_; - for (int col = 0; col < other.nCols_; ++col) { - for (int link = 0; link < nCols_; ++link) { + for (size_t col = 0; col < other.nCols_; ++col) { + for (size_t link = 0; link < nCols_; ++link) { *output += rowPtr[link] * other.data_[link * other.nCols_ + col]; } ++output; From 8ceaae954f5d2108a8700c9b67055c23647a6092 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 10:35:57 -0500 Subject: [PATCH 25/33] Init uninitialized var --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index e9ec691..c55d94f 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -2965,7 +2965,7 @@ class PMEInstance { filterAtomsAndBuildSplineCache(parameterAngMom, coordinates); auto realGrid = spreadParameters(parameterAngMom, parameters); - Real energy; + Real energy = 0; if (algorithmType_ == AlgorithmType::PME) { auto gridAddress = forwardTransform(realGrid); energy = convolveE(gridAddress); From 51c65a7038af806103506b6608fe7ac1a54f2143 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 13:13:46 -0500 Subject: [PATCH 26/33] Add missing variable declaration --- src/helpme.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/helpme.h b/src/helpme.h index c55d94f..7a54134 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1037,6 +1037,7 @@ class PMEInstance { if (coordinates.nRows() != parameters.nRows()) throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); + int n_param_cols = nCartesian(parameterAngMom) - cartesianOffset; if (n_param_cols < 0 || parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( From 8618fbd7e235ff2f19777bc79e92de40189c2c53 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 09:28:48 -0500 Subject: [PATCH 27/33] Fix signed/unsigned comparison --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index 7a54134..b195658 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1603,7 +1603,7 @@ class PMEInstance { //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; - for (int atom = 0; atom < nAtoms; ++atom) { + for (size_t atom = 0; atom < nAtoms; ++atom) { const Real *atomCoords = coords[atom]; Real cCoord = atomCoords[0] * recVecs_(0, 2) + atomCoords[1] * recVecs_(1, 2) + atomCoords[2] * recVecs_(2, 2) - EPS; From 04ac60153b0f77b0167ae09fd64ae6911cf042f8 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 09:29:09 -0500 Subject: [PATCH 28/33] Update the standalone include file --- single_include/helpme_standalone.h | 106 +++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 29 deletions(-) diff --git a/single_include/helpme_standalone.h b/single_include/helpme_standalone.h index 8e57c01..acb8f5d 100644 --- a/single_include/helpme_standalone.h +++ b/single_include/helpme_standalone.h @@ -210,7 +210,8 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re Real threshold_norm; Real threshold; Real tan_phi, sin_phi, cos_phi, tan2_phi, sin2_phi, cos2_phi; - Real sin_2phi, cos_2phi, cot_2phi; + Real sin_2phi, cot_2phi; + //Real cos_2phi; Real dum1; Real dum2; Real dum3; @@ -263,7 +264,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re if (tan_phi < 0) sin_phi = -sin_phi; cos_phi = sqrt(cos2_phi); sin_2phi = 2 * sin_phi * cos_phi; - cos_2phi = cos2_phi - sin2_phi; + //cos_2phi = cos2_phi - sin2_phi; // Rotate columns k and m for both the matrix A // and the matrix of eigenvectors. @@ -793,8 +794,8 @@ class Matrix { */ void assertSymmetric(const Real& threshold = 1e-10f) const { assertSquare(); - for (int row = 0; row < nRows_; ++row) { - for (int col = 0; col < row; ++col) { + for (size_t row = 0; row < nRows_; ++row) { + for (size_t col = 0; col < row; ++col) { if (std::abs(data_[row * nCols_ + col] - data_[col * nCols_ + row]) > threshold) throw std::runtime_error("Unexpected non-symmetric matrix found."); } @@ -821,7 +822,7 @@ class Matrix { Matrix evecs = std::get<1>(eigenPairs); evalsReal.applyOperationToEachElement(function); Matrix evecsT = evecs.transpose(); - for (int row = 0; row < nRows_; ++row) { + for (size_t row = 0; row < nRows_; ++row) { Real transformedEigenvalue = evalsReal[row][0]; std::for_each(evecsT.data_ + row * nCols_, evecsT.data_ + (row + 1) * nCols_, [&](Real& val) { val *= transformedEigenvalue; }); @@ -857,10 +858,10 @@ class Matrix { throw std::runtime_error("Attempting to multiply matrices with incompatible dimensions."); Matrix product(nRows_, other.nCols_); Real* output = product.data_; - for (int row = 0; row < nRows_; ++row) { + for (size_t row = 0; row < nRows_; ++row) { const Real* rowPtr = data_ + row * nCols_; - for (int col = 0; col < other.nCols_; ++col) { - for (int link = 0; link < nCols_; ++link) { + for (size_t col = 0; col < other.nCols_; ++col) { + for (size_t link = 0; link < nCols_; ++link) { *output += rowPtr[link] * other.data_[link * other.nCols_ + col]; } ++output; @@ -1035,10 +1036,10 @@ class Matrix { unsortedEigenVectors.transposeInPlace(); std::vector> eigenPairs; - for (int val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); + for (size_t val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); std::sort(eigenPairs.begin(), eigenPairs.end()); if (order == SortOrder::Descending) std::reverse(eigenPairs.begin(), eigenPairs.end()); - for (int val = 0; val < nRows_; ++val) { + for (size_t val = 0; val < nRows_; ++val) { const auto& e = eigenPairs[val]; eigenValues.data_[val] = std::get<0>(e); std::copy(std::get<1>(e), std::get<1>(e) + nCols_, sortedEigenVectors[val]); @@ -2263,7 +2264,10 @@ class BSpline { derivativeLevel_ = derivativeLevel; // The +1 is to account for the fact that we need to store entries up to and including the max. - if (splines_.nRows() < derivativeLevel + 1 || splines_.nCols() != order) + if (derivativeLevel < 0 || order < 0) { + throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); + } + if (splines_.nRows() < (size_t)derivativeLevel + 1 || splines_.nCols() != (size_t)order) splines_ = Matrix(derivativeLevel + 1, order); splines_.setZero(); @@ -2378,7 +2382,9 @@ namespace helpme { template void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ cbaPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) for (int A = 0; A <= -1 + aDimension; ++A) @@ -2398,7 +2404,9 @@ void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int template void permuteABCtoACB(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ acbPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int A = 0; A <= -1 + aDimension; ++A) for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) @@ -3233,7 +3241,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3340,7 +3350,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3449,7 +3461,9 @@ class PMEInstance { if (coordinates.nRows() != parameters.nRows()) throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); - if (parameters.nCols() != (nCartesian(parameterAngMom) - cartesianOffset)) + int n_param_cols = nCartesian(parameterAngMom) - cartesianOffset; + if (n_param_cols < 0 || + parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( "Mismatch in the number of parameters provided and the parameter angular momentum"); } @@ -3499,7 +3513,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3956,14 +3972,15 @@ class PMEInstance { int nComponents = nCartesian(parameterAngMom); size_t numBA = (size_t)myGridDimensionB_ * myGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif - for (size_t row = threadID; row < myGridDimensionC_; row += nThreads_) { + for (int row = threadID; row < myGridDimensionC_; row += nThreads_) { std::fill(&realGrid[row * numBA], &realGrid[(row + 1) * numBA], Real(0)); } for (const auto &spline : splinesPerThread_[threadID]) { @@ -3995,27 +4012,28 @@ class PMEInstance { gridAtomList_.resize(gridDimensionC_); // Classify atoms to their worker threads first, then construct splines for each thread +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif - for (size_t row = threadID; row < gridDimensionC_; row += nThreads_) { + for (int row = threadID; row < gridDimensionC_; row += nThreads_) { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; - const auto &gridIteratorC = threadedGridIteratorC_[threadID]; + //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; - for (int atom = 0; atom < nAtoms; ++atom) { + for (size_t atom = 0; atom < nAtoms; ++atom) { const Real *atomCoords = coords[atom]; Real cCoord = atomCoords[0] * recVecs_(0, 2) + atomCoords[1] * recVecs_(1, 2) + atomCoords[2] * recVecs_(2, 2) - EPS; cCoord -= floor(cCoord); short cStartingGridPoint = gridDimensionC_ * cCoord; - size_t thisAtomsThread = cStartingGridPoint % nThreads_; + int thisAtomsThread = (int)cStartingGridPoint % nThreads_; const auto &cGridIterator = gridIteratorC_[cStartingGridPoint]; if (cGridIterator.size() && thisAtomsThread == threadID) { Real aCoord = atomCoords[0] * recVecs_(0, 0) + atomCoords[1] * recVecs_(1, 0) + @@ -4039,7 +4057,6 @@ class PMEInstance { } numAtomsPerThread_[threadID] = myNumAtoms; } - // We could intervene here and do some load balancing by inspecting the list. Currently // the lazy approach of just assuming that the atoms are evenly distributed along c is used. @@ -4050,7 +4067,7 @@ class PMEInstance { // certain scale factor to try and minimize allocations in a not-too-wasteful manner. if (splineCache_.size() < numCacheEntries) { size_t newSize = static_cast(1.2 * numCacheEntries); - for (int atom = splineCache_.size(); atom < newSize; ++atom) + for (size_t atom = splineCache_.size(); atom < newSize; ++atom) splineCache_.emplace_back(splineOrder_, splineDerivativeLevel); } std::vector threadOffset(nThreads_, 0); @@ -4058,15 +4075,16 @@ class PMEInstance { threadOffset[thread] = threadOffset[thread - 1] + numAtomsPerThread_[thread - 1]; } +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif size_t entry = threadOffset[threadID]; - for (size_t cRow = threadID; cRow < gridDimensionC_; cRow += nThreads_) { + for (int cRow = threadID; cRow < gridDimensionC_; cRow += nThreads_) { for (const auto &gridPointAndAtom : gridAtomList_[cRow]) { size_t atom = gridPointAndAtom.second; const Real *atomCoords = coords[atom]; @@ -4094,13 +4112,13 @@ class PMEInstance { } } } - // Finally, find all of the splines that this thread will need to handle +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto &mySplineList = splinesPerThread_[threadID]; @@ -4288,14 +4306,20 @@ class PMEInstance { * \return Pointer to the transformed grid, which is stored in one of the buffers in BAC order. */ Complex *forwardTransform(Real *realGrid) { +#if HAVE_MPI == 1 Real *__restrict__ realCBA; +#endif Complex *__restrict__ buffer1, *__restrict__ buffer2; if (realGrid == reinterpret_cast(workSpace1_.data())) { +#if HAVE_MPI == 1 realCBA = reinterpret_cast(workSpace2_.data()); +#endif buffer1 = workSpace2_.data(); buffer2 = workSpace1_.data(); } else { +#if HAVE_MPI == 1 realCBA = reinterpret_cast(workSpace1_.data()); +#endif buffer1 = workSpace1_.data(); buffer2 = workSpace2_.data(); } @@ -4328,15 +4352,18 @@ class PMEInstance { helpme::vector buffer(nThreads_ * scratchRowDim); // A transform, with instant sort to CAB ordering for each local block +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; +#ifdef _OPENMP #pragma omp for +#endif for (int c = 0; c < subsetOfCAlongA_; ++c) { for (int b = 0; b < myGridDimensionB_; ++b) { Real *gridPtr = realGrid + c * myGridDimensionB_ * gridDimensionA_ + b * gridDimensionA_; @@ -4381,7 +4408,9 @@ class PMEInstance { // B transform size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_FORWARD); } @@ -4429,7 +4458,9 @@ class PMEInstance { #endif // C transform size_t numBA = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ba = 0; ba < numBA; ++ba) { fftHelperC_.transform(buffer2 + ba * gridDimensionC_, FFTW_FORWARD); } @@ -4456,7 +4487,9 @@ class PMEInstance { // C transform size_t numYX = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t yx = 0; yx < numYX; ++yx) { fftHelperC_.transform(convolvedGrid + yx * gridDimensionC_, FFTW_BACKWARD); } @@ -4508,11 +4541,15 @@ class PMEInstance { // B transform with instant sort of local blocks from CAB -> CBA order size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_BACKWARD); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int c = 0; c < subsetOfCAlongB_; ++c) { for (int a = 0; a < myComplexGridDimensionA_; ++a) { int cx = c * myComplexGridDimensionA_ * gridDimensionB_ + a * gridDimensionB_; @@ -4559,7 +4596,9 @@ class PMEInstance { // A transform Real *realGrid = reinterpret_cast(buffer2); +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int cb = 0; cb < subsetOfCAlongA_ * myGridDimensionB_; ++cb) { fftHelperA_.transform(buffer1 + cb * complexGridDimensionA_, realGrid + cb * gridDimensionA_); } @@ -4672,7 +4711,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = 0; // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { energy += transformedGrid[yxz] * transformedGrid[yxz] * influenceFunction[yxz]; transformedGrid[yxz] *= influenceFunction[yxz]; @@ -4703,7 +4744,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = Complex(0, 0); const size_t numCTerms(myNumKSumTermsC_); +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; int kx = firstKSumTermA_ + xz / numCTerms; @@ -4801,14 +4844,15 @@ class PMEInstance { } } } +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); +#pragma omp for #else + { int threadID = 0; #endif -#pragma omp for for (size_t atom = 0; atom < nAtoms; ++atom) { const auto &cacheEntry = splineCache_[atom]; const auto &absAtom = cacheEntry.absoluteAtomNumber; @@ -5190,7 +5234,9 @@ class PMEInstance { // Direct space, using simple O(N^2) algorithm. This can be improved using a nonbonded list if needed. Real cutoffSquared = sphericalCutoff * sphericalCutoff; Real kappaSquared = kappa_ * kappa_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t i = 0; i < nAtoms; ++i) { const auto &coordsI = coordinates.row(i); Real *phiPtr = potential[i]; @@ -5222,7 +5268,9 @@ class PMEInstance { } else { std::logic_error("Unknown algorithm in helpme::computePAtAtomicSites"); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t atom = 0; atom < nAtoms; ++atom) { const auto &cacheEntry = splineCache_[atom]; const auto &absAtom = cacheEntry.absoluteAtomNumber; @@ -5342,7 +5390,7 @@ class PMEInstance { filterAtomsAndBuildSplineCache(parameterAngMom, coordinates); auto realGrid = spreadParameters(parameterAngMom, parameters); - Real energy; + Real energy = 0; if (algorithmType_ == AlgorithmType::PME) { auto gridAddress = forwardTransform(realGrid); energy = convolveE(gridAddress); From d2e37c6bb53540c53fcce29901d27f9f5db2360a Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 09:37:45 -0500 Subject: [PATCH 29/33] Init uninitialized var in Matrix constructor --- src/matrix.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix.h b/src/matrix.h index cb13439..f57234b 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -185,7 +185,7 @@ class Matrix { /*! * \brief Matrix Constructs an empty matrix. */ - Matrix() : nRows_(0), nCols_(0) {} + Matrix() : nRows_(0), nCols_(0), data_(0) {} /*! * \brief Matrix Constructs a new matrix, allocating memory. From 46bb70360973025fa104e60e1118d170c392b07d Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 09:46:15 -0500 Subject: [PATCH 30/33] Init uninitialized var --- src/helpme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme.h b/src/helpme.h index b195658..a00261d 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -451,7 +451,7 @@ class PMEInstance { } } - Real *potentialGrid; + Real *potentialGrid = 0; if (algorithmType_ == AlgorithmType::PME) { auto gridAddress = forwardTransform(realGrid); if (virial.nRows() == 0 && virial.nCols() == 0) { From f1b3c4f1d0258b9f1c785de0f4c2dfff299244e9 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 09:46:24 -0500 Subject: [PATCH 31/33] Update standalone include --- single_include/helpme_standalone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/single_include/helpme_standalone.h b/single_include/helpme_standalone.h index acb8f5d..56edcd5 100644 --- a/single_include/helpme_standalone.h +++ b/single_include/helpme_standalone.h @@ -646,7 +646,7 @@ class Matrix { /*! * \brief Matrix Constructs an empty matrix. */ - Matrix() : nRows_(0), nCols_(0) {} + Matrix() : nRows_(0), nCols_(0), data_(0) {} /*! * \brief Matrix Constructs a new matrix, allocating memory. @@ -2875,7 +2875,7 @@ class PMEInstance { } } - Real *potentialGrid; + Real *potentialGrid = 0; if (algorithmType_ == AlgorithmType::PME) { auto gridAddress = forwardTransform(realGrid); if (virial.nRows() == 0 && virial.nCols() == 0) { From 64bede2d626571651910b0b4ddff3180305afaf0 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 10:16:32 -0500 Subject: [PATCH 32/33] Try to fix linting issues --- src/helpme.h | 5 ++--- src/lapack_wrapper.h | 4 ++-- src/matrix.h | 3 ++- src/splines.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/helpme.h b/src/helpme.h index a00261d..b33eb53 100644 --- a/src/helpme.h +++ b/src/helpme.h @@ -1038,8 +1038,7 @@ class PMEInstance { throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); int n_param_cols = nCartesian(parameterAngMom) - cartesianOffset; - if (n_param_cols < 0 || - parameters.nCols() != (size_t)n_param_cols) + if (n_param_cols < 0 || parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( "Mismatch in the number of parameters provided and the parameter angular momentum"); } @@ -1600,7 +1599,7 @@ class PMEInstance { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; - //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused + // const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; for (size_t atom = 0; atom < nAtoms; ++atom) { diff --git a/src/lapack_wrapper.h b/src/lapack_wrapper.h index f6c5956..c2abe12 100644 --- a/src/lapack_wrapper.h +++ b/src/lapack_wrapper.h @@ -129,7 +129,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re Real threshold; Real tan_phi, sin_phi, cos_phi, tan2_phi, sin2_phi, cos2_phi; Real sin_2phi, cot_2phi; - //Real cos_2phi; + // Real cos_2phi; Real dum1; Real dum2; Real dum3; @@ -182,7 +182,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re if (tan_phi < 0) sin_phi = -sin_phi; cos_phi = sqrt(cos2_phi); sin_2phi = 2 * sin_phi * cos_phi; - //cos_2phi = cos2_phi - sin2_phi; + // cos_2phi = cos2_phi - sin2_phi; // Rotate columns k and m for both the matrix A // and the matrix of eigenvectors. diff --git a/src/matrix.h b/src/matrix.h index f57234b..2c85088 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -575,7 +575,8 @@ class Matrix { unsortedEigenVectors.transposeInPlace(); std::vector> eigenPairs; - for (size_t val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); + for (size_t val = 0; val < nRows_; ++val) + eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); std::sort(eigenPairs.begin(), eigenPairs.end()); if (order == SortOrder::Descending) std::reverse(eigenPairs.begin(), eigenPairs.end()); for (size_t val = 0; val < nRows_; ++val) { diff --git a/src/splines.h b/src/splines.h index f6015d3..2b3c1a8 100644 --- a/src/splines.h +++ b/src/splines.h @@ -89,7 +89,7 @@ class BSpline { // The +1 is to account for the fact that we need to store entries up to and including the max. if (derivativeLevel < 0 || order < 0) { - throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); + throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); } if (splines_.nRows() < (size_t)derivativeLevel + 1 || splines_.nCols() != (size_t)order) splines_ = Matrix(derivativeLevel + 1, order); From 42c686f6ee80182580ef861dd3b4bc90f056dad3 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 10:27:54 -0500 Subject: [PATCH 33/33] Fix linting issues in the single include --- single_include/helpme_standalone.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/single_include/helpme_standalone.h b/single_include/helpme_standalone.h index 56edcd5..dd02231 100644 --- a/single_include/helpme_standalone.h +++ b/single_include/helpme_standalone.h @@ -211,7 +211,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re Real threshold; Real tan_phi, sin_phi, cos_phi, tan2_phi, sin2_phi, cos2_phi; Real sin_2phi, cot_2phi; - //Real cos_2phi; + // Real cos_2phi; Real dum1; Real dum2; Real dum3; @@ -264,7 +264,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re if (tan_phi < 0) sin_phi = -sin_phi; cos_phi = sqrt(cos2_phi); sin_2phi = 2 * sin_phi * cos_phi; - //cos_2phi = cos2_phi - sin2_phi; + // cos_2phi = cos2_phi - sin2_phi; // Rotate columns k and m for both the matrix A // and the matrix of eigenvectors. @@ -1036,7 +1036,8 @@ class Matrix { unsortedEigenVectors.transposeInPlace(); std::vector> eigenPairs; - for (size_t val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); + for (size_t val = 0; val < nRows_; ++val) + eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); std::sort(eigenPairs.begin(), eigenPairs.end()); if (order == SortOrder::Descending) std::reverse(eigenPairs.begin(), eigenPairs.end()); for (size_t val = 0; val < nRows_; ++val) { @@ -2265,7 +2266,7 @@ class BSpline { // The +1 is to account for the fact that we need to store entries up to and including the max. if (derivativeLevel < 0 || order < 0) { - throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); + throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); } if (splines_.nRows() < (size_t)derivativeLevel + 1 || splines_.nCols() != (size_t)order) splines_ = Matrix(derivativeLevel + 1, order); @@ -3462,8 +3463,7 @@ class PMEInstance { throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); int n_param_cols = nCartesian(parameterAngMom) - cartesianOffset; - if (n_param_cols < 0 || - parameters.nCols() != (size_t)n_param_cols) + if (n_param_cols < 0 || parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( "Mismatch in the number of parameters provided and the parameter angular momentum"); } @@ -4024,7 +4024,7 @@ class PMEInstance { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; - //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused + // const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; for (size_t atom = 0; atom < nAtoms; ++atom) {