Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 50 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,18 @@ set(LIB_NAME "rpa")
set(FORTRAN_LIB_NAME "rpa_f")

# options setup
option(USE_LIBRI "Use LibRI for tensor contraction" OFF)
option(USE_LIBRI "Use LibRI for tensor contraction" ON)
option(USE_CMAKE_INC "Use cmake.inc for configure" OFF)
option(USE_GREENX_API "Use GreenX API for minimax grids generation" OFF)
option(USE_GREENX_API "Use GreenX API for minimax grids generation" ON)
option(USE_EXTERNAL_GREENX "Use external GreenX library rather than the packaged one" OFF)
option(ENABLE_TEST "Flag to build unit tests" ON)
option(ENABLE_DRIVER "Flag to build driver executables" ON)
option(ENABLE_FORTRAN_BIND "Flag to build Fotran binding" OFF)
option(VERBOSE_OUTPUT "Flag to print verbose information in stdout and process output" OFF)
option(USE_CUDA "Use cuda for EcRPA calculation" OFF)
option(ENABLE_CUSOLVERMP "Use cusolverMp for EcRPA calculation" OFF)
option(ENABLE_CUBLASMP "Use cublasmp for EcRPA calculation" OFF )
option(ENABLE_NVHPC "Use nvhpc for calculation" OFF )
# NOTE: static library not tested
option(BUILD_LIBRPA_SHARED "Flag to build shared libraries" ON)

Expand Down Expand Up @@ -61,6 +65,10 @@ if(USE_GREENX_API OR ENABLE_FORTRAN_BIND)
else()
enable_language(CXX)
endif()
# added by hbchen in 2025-5-19
if(USE_CUDA)
enable_language(CUDA)
endif()

# bypass the deprecation warning of classic C++ of oneAPI
if(CMAKE_CXX_COMPILER_ID MATCHES Intel)
Expand Down Expand Up @@ -177,6 +185,46 @@ if(USE_LIBRI)
add_compile_definitions("LIBRPA_USE_LIBRI")
endif()


# GPU acceleration (added by chenhaobo, 2025-04-26)
# cmake_policy(SET CMP0146 NEW)
if(USE_CUDA)
    # NOTE(review): the FindCUDA module is deprecated; CUDAToolkit below provides
    # the toolkit paths. FindCUDA is kept for now in case legacy CUDA_* variables
    # are consumed elsewhere in this file — confirm before removing.
    find_package(CUDA REQUIRED)
    find_package(CUDAToolkit REQUIRED)
    # Fortran-style symbol naming convention expected by MAGMA/BLAS interfaces
    add_definitions(-DADD_)
    # MAGMA headers: honour an explicit MAGMA_INCLUDE_DIR from the environment,
    # otherwise derive it from MAGMA_ROOT. (Previously the derived path was
    # unconditionally overwritten by the environment value, leaving the variable
    # empty whenever MAGMA_INCLUDE_DIR was not exported.)
    if(DEFINED ENV{MAGMA_INCLUDE_DIR})
        set(MAGMA_INCLUDE_DIR "$ENV{MAGMA_INCLUDE_DIR}")
    else()
        set(MAGMA_INCLUDE_DIR "${MAGMA_ROOT}/include")
    endif()
    include_directories("${MAGMA_INCLUDE_DIR}")
    link_directories("${MAGMA_ROOT}/lib")
    set(CMAKE_CUDA_STANDARD 14)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
    # NOTE(review): a single space-separated string of -l flags; presumably
    # math_libs is consumed in a context where this works — prefer the
    # CUDA::cublas / CUDA::cudart / CUDA::cusolver imported targets if math_libs
    # feeds target_link_libraries.
    set(CUDA_LIBRARIES "-lcublas -lcudart -lcusolver")
    list(APPEND math_libs
        ${CUDA_LIBRARIES}
        "${MAGMA_ROOT}/lib/libmagma.so"
    )
    add_compile_definitions("LIBRPA_USE_CUDA")
endif()
if(ENABLE_NVHPC)
    # NVHPC implies the distributed cuSOLVERMp/cuBLASMp code paths as well
    set(ENABLE_CUSOLVERMP ON)
    set(ENABLE_CUBLASMP ON)
    add_compile_definitions("ENABLE_NVHPC")
endif()
if(ENABLE_CUSOLVERMP)
    add_compile_definitions("ENABLE_CUSOLVERMP")
    # fixed variable-name typo: LIBRAYIES -> LIBRARIES (set and consumed only
    # within this block, so the rename is local and safe)
    # NOTE(review): value kept as one space-separated string to match the
    # CUDA_LIBRARIES convention used above — confirm how math_libs is consumed
    # before converting to a proper list.
    set(CUSOLVERMP_LIBRARIES "-lcal -lcusolverMp")
    list(APPEND math_libs
        ${CUSOLVERMP_LIBRARIES}
    )
endif()
if(ENABLE_CUBLASMP)
    add_compile_definitions("ENABLE_CUBLASMP")
    # fixed variable-name typo: LIBRAYIES -> LIBRARIES (set and consumed only
    # within this block, so the rename is local and safe)
    # NOTE(review): value kept as one space-separated string to match the
    # CUDA_LIBRARIES convention used above — confirm how math_libs is consumed
    # before converting to a proper list.
    set(CUBLASMP_LIBRARIES "-lcublasmp -lcurand -lcublas")
    list(APPEND math_libs
        ${CUBLASMP_LIBRARIES}
    )
endif()
# end of GPU acceleration additions (chenhaobo, 2025-04-26)

if(CMAKE_BUILD_TYPE MATCHES "Debug")
message(STATUS "Build type set to Debug, adding LIBRPA_DEBUG preprocessor directive")
add_compile_definitions("LIBRPA_DEBUG")
Expand Down
12 changes: 9 additions & 3 deletions driver/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
#include "utils_cmake.h"
#include "utils_mem.h"
#include "utils_mpi_io.h"

#ifdef ENABLE_NVHPC
#include "device_stream.h"
#endif
static void initialize(int argc, char **argv)
{
using namespace LIBRPA::envs;
Expand All @@ -54,7 +56,9 @@ static void initialize(int argc, char **argv)
}

initialize_librpa_environment(MPI_COMM_WORLD, 0, 0, "");

#ifdef ENABLE_NVHPC
device_stream.init();
#endif
// Global profiler begins right after MPI is initialized
Profiler::start("total", "Total");
}
Expand All @@ -77,7 +81,9 @@ static void finalize(bool success)
lib_printf("libRPA failed\n");
}
}

#ifdef ENABLE_NVHPC
device_stream.finalize();
#endif
finalize_librpa_environment();

MPI_Finalize();
Expand Down
12 changes: 11 additions & 1 deletion driver/task_gw_band.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
#include "ri.h"
#include "utils_timefreq.h"
#include "write_aims.h"

#ifdef ENABLE_NVHPC
#include "epsilon_cuda.h"
#include <cuda_runtime.h>
#endif
void task_g0w0_band(std::map<Vector3_Order<double>, ComplexMatrix> &sinvS)
{
using LIBRPA::envs::mpi_comm_global_h;
Expand Down Expand Up @@ -156,6 +159,13 @@ void task_g0w0_band(std::map<Vector3_Order<double>, ComplexMatrix> &sinvS)
Wc_freq_q;
if (Params::use_scalapack_gw_wc)
{
#ifdef ENABLE_NVHPC
int numDevices;
cudaError_t cudaStat = cudaGetDeviceCount(&numDevices);
if(cudaStat == cudaSuccess && numDevices>0)
Wc_freq_q = compute_Wc_freq_q_blacs_cuda(chi0, Vq, Vq_cut, epsmac_LF_imagfreq);
else
#endif
Wc_freq_q = compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, epsmac_LF_imagfreq);
}
else
Expand Down
23 changes: 23 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,29 @@ else()
list(APPEND lib_sources get_minimax_local.cpp)
endif()

if(USE_CUDA)
list(APPEND lib_sources
epsilon_cuda.cpp
cuda_connector.cu
)
endif()
if(ENABLE_NVHPC)
list(APPEND lib_sources
cuda_connector.cpp
array_desc_device.cu
device_stream.cpp
matrix_device.cpp
device_connector.cpp
)
# set_source_files_properties(
# base_blacs.cpp
# PROPERTIES
# LANGUAGE CUDA
# # COMPILE_OPTIONS "--x cu"
# )
endif()


target_include_directories(rpa_lib
PRIVATE
# ${CMAKE_CURRENT_LIST_DIR}
Expand Down
21 changes: 21 additions & 0 deletions src/app_rpa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
#include "stl_io_helper.h"
#include "utils_mem.h"
#include "utils_timefreq.h"
#ifdef LIBRPA_USE_CUDA
#include <cuda_runtime.h> // added by hbchen in 2025-5-17
#include <epsilon_cuda.h> // added by hbchen in 2025-7-26
#endif

namespace LIBRPA
{
Expand Down Expand Up @@ -87,6 +91,23 @@ void get_rpa_correlation_energy_(std::complex<double> &rpa_corr,
mpi_comm_global_h.barrier();
Profiler::start("EcRPA", "Compute RPA correlation Energy");
CorrEnergy corr;
#ifdef LIBRPA_USE_CUDA
int deviceCount;
cudaError_t err= cudaGetDeviceCount(&deviceCount);
// lib_printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0);
printf("Number of CUDA devices: %d\n", deviceCount);
// deviceCount!=4 is a bug, because if I don't set the gres=gpu:*, cuda will detect all the gpu device
if(err==cudaSuccess&&deviceCount>0){//说明存在gpu设备
#ifdef ENABLE_NVHPC
if (Params::use_scalapack_ecrpa &&
(LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR ||
LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
corr = compute_RPA_correlation_blacs_2d_cuda(chi0, Vq);
else
#endif
corr = compute_RPA_correlation_cuda(chi0, Vq);
}else
#endif
if (Params::use_scalapack_ecrpa &&
(LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR ||
LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
Expand Down
36 changes: 36 additions & 0 deletions src/array_desc_device.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "array_desc_device.h"

// Build a device-friendly snapshot of a host-side BLACS array descriptor by
// copying all of its scalar fields (context, process grid, global/block/local
// dimensions). Members are initialized in declaration order.
Array_Desc_Device::Array_Desc_Device(const LIBRPA::Array_Desc& array_desc)
    : ictxt_(array_desc.ictxt()),
      nprows_(array_desc.nprows()),
      myprow_(array_desc.myprow()),
      npcols_(array_desc.npcols()),
      mypcol_(array_desc.mypcol()),
      m_(array_desc.m()),
      n_(array_desc.n()),
      mb_(array_desc.mb()),
      nb_(array_desc.nb()),
      irsrc_(array_desc.irsrc()),
      icsrc_(array_desc.icsrc()),
      lld_(array_desc.lld()),
      m_local_(array_desc.m_loc()),
      n_local_(array_desc.n_loc())
{
}
// Process coordinate owning global index indxglob under a 0-based
// block-cyclic distribution (ScaLAPACK INDXG2P): blocks of size nb are dealt
// cyclically over nprocs processes starting at isrcproc.
__host__ __device__ int Array_Desc_Device::indx_g2p(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) {
    const int block_id = indxglob / nb;
    return (block_id + isrcproc) % nprocs;
}
// Local index of global index indxglob on its owning process under a 0-based
// block-cyclic distribution (ScaLAPACK INDXG2L). isrcproc is not used by the
// formula; the parameter is kept for signature parity with indx_g2p.
__host__ __device__ int Array_Desc_Device::indx_g2l(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) {
    const int local_block = indxglob / (nb * nprocs);
    const int offset_in_block = indxglob % nb;
    return local_block * nb + offset_in_block;
}
// Local row index of global row gindx, or -1 when the row is out of range
// or owned by a different process row.
__host__ __device__ int Array_Desc_Device::indx_g2l_r(int gindx) const {
    if (gindx >= this->m_) {
        return -1;
    }
    if (indx_g2p(gindx, this->mb_, this->irsrc_, this->nprows_) != this->myprow_) {
        return -1;
    }
    return indx_g2l(gindx, this->mb_, this->irsrc_, this->nprows_);
}
// Local column index of global column gindx, or -1 when the column is out of
// range or owned by a different process column.
__host__ __device__ int Array_Desc_Device::indx_g2l_c(int gindx) const {
    if (gindx >= this->n_) {
        return -1;
    }
    if (indx_g2p(gindx, this->nb_, this->icsrc_, this->npcols_) != this->mypcol_) {
        return -1;
    }
    return indx_g2l(gindx, this->nb_, this->icsrc_, this->npcols_);
}
61 changes: 61 additions & 0 deletions src/array_desc_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once  // guard added: the header previously had none, so including it
              // from more than one header/TU redefined the class

#include "base_blacs.h"

// Trivially-copyable mirror of LIBRPA::Array_Desc whose accessors carry
// __host__ __device__ qualifiers, so a copy can be passed by value to CUDA
// kernels. Constructed on the host from an existing BLACS array descriptor.
class Array_Desc_Device{
private:
    int ictxt_;    // BLACS context handle
    // process-grid shape and this process's coordinates within it
    int nprows_;
    int myprow_;
    int npcols_;
    int mypcol_;

    // Array dimensions
    int m_;        // global rows
    int n_;        // global columns
    int mb_;       // row block size
    int nb_;       // column block size
    int irsrc_;    // process row holding the first row block
    int icsrc_;    // process column holding the first column block
    int lld_;      // local leading dimension
    int m_local_;  // rows stored locally on this process
    int n_local_;  // columns stored locally on this process
    // ScaLAPACK-style INDXG2P: process coordinate owning a global index
    __host__ __device__ static int indx_g2p(
        const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs);
    // ScaLAPACK-style INDXG2L: local index of a global index on its owner
    __host__ __device__ static int indx_g2l(
        const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs);
public:
    // Snapshot all scalar fields of a host-side descriptor
    Array_Desc_Device(const LIBRPA::Array_Desc& array_desc);
    // local row index of global row gindx, or -1 if not owned by this process
    __host__ __device__
    int indx_g2l_r(int gindx) const;
    // local column index of global column gindx, or -1 if not owned
    __host__ __device__
    int indx_g2l_c(int gindx) const;
    __host__ __device__
    const int& m() const{ return m_; }
    __host__ __device__
    const int& n() const{ return n_; }
    __host__ __device__
    const int& mb() const{ return mb_; }
    __host__ __device__
    const int& nb() const{ return nb_; }
    __host__ __device__
    const int& irsrc() const{ return irsrc_; }
    __host__ __device__
    const int& icsrc() const{ return icsrc_; }
    __host__ __device__
    const int& lld() const{ return lld_; }
    __host__ __device__
    const int& m_loc() const{ return m_local_; }
    __host__ __device__
    const int& n_loc() const{ return n_local_; }
    __host__ __device__
    const int& nprows() const{ return nprows_; }
    __host__ __device__
    const int& npcols() const{ return npcols_; }
    __host__ __device__
    const int& myprow() const{ return myprow_; }
    __host__ __device__
    const int& mypcol() const{ return mypcol_; }
    __host__ __device__
    const int& ictxt() const{ return ictxt_; }

};
Loading
Loading