Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 50 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,18 @@ set(LIB_NAME "rpa")
set(FORTRAN_LIB_NAME "rpa_f")

# options setup
option(USE_LIBRI "Use LibRI for tensor contraction" OFF)
option(USE_LIBRI "Use LibRI for tensor contraction" ON)
option(USE_CMAKE_INC "Use cmake.inc for configure" OFF)
option(USE_GREENX_API "Use GreenX API for minimax grids generation" OFF)
option(USE_GREENX_API "Use GreenX API for minimax grids generation" ON)
option(USE_EXTERNAL_GREENX "Use external GreenX library rather than the packaged one" OFF)
option(ENABLE_TEST "Flag to build unit tests" ON)
option(ENABLE_DRIVER "Flag to build driver executables" ON)
option(ENABLE_FORTRAN_BIND "Flag to build Fotran binding" OFF)
option(VERBOSE_OUTPUT "Flag to print verbose information in stdout and process output" OFF)
option(USE_CUDA "Use cuda for EcRPA calculation" OFF)
option(ENABLE_CUSOLVERMP "Use cusolverMp for EcRPA calculation" OFF)
option(ENABLE_CUBLASMP "Use cublasmp for EcRPA calculation" OFF )
option(ENABLE_NVHPC "Use nvhpc for calculation" OFF )
# NOTE: static library not tested
option(BUILD_LIBRPA_SHARED "Flag to build shared libraries" ON)

Expand Down Expand Up @@ -61,6 +65,10 @@ if(USE_GREENX_API OR ENABLE_FORTRAN_BIND)
else()
enable_language(CXX)
endif()
# added by hbchen in 2025-5-19
if(USE_CUDA)
enable_language(CUDA)
endif()

# bypass the deprecation warning of classic C++ of oneAPI
if(CMAKE_CXX_COMPILER_ID MATCHES Intel)
Expand Down Expand Up @@ -177,6 +185,46 @@ if(USE_LIBRI)
add_compile_definitions("LIBRPA_USE_LIBRI")
endif()


# GPU acceleration (added by chenhaobo, 2025-04-26)
# cmake_policy(SET CMP0146 NEW)
if(USE_CUDA)
    # NOTE(review): the FindCUDA module is deprecated; CUDAToolkit below provides
    # the toolkit paths. FindCUDA is kept for now in case legacy CUDA_* variables
    # are consumed elsewhere in this file — confirm before removing.
    find_package(CUDA REQUIRED)
    find_package(CUDAToolkit REQUIRED)
    # Fortran-style symbol naming convention expected by MAGMA/BLAS interfaces
    add_definitions(-DADD_)
    # MAGMA headers: honour an explicit MAGMA_INCLUDE_DIR from the environment,
    # otherwise derive it from MAGMA_ROOT. (Previously the derived path was
    # unconditionally overwritten by the environment value, leaving the variable
    # empty whenever MAGMA_INCLUDE_DIR was not exported.)
    if(DEFINED ENV{MAGMA_INCLUDE_DIR})
        set(MAGMA_INCLUDE_DIR "$ENV{MAGMA_INCLUDE_DIR}")
    else()
        set(MAGMA_INCLUDE_DIR "${MAGMA_ROOT}/include")
    endif()
    include_directories("${MAGMA_INCLUDE_DIR}")
    link_directories("${MAGMA_ROOT}/lib")
    set(CMAKE_CUDA_STANDARD 14)
    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
    # NOTE(review): a single space-separated string of -l flags; presumably
    # math_libs is consumed in a context where this works — prefer the
    # CUDA::cublas / CUDA::cudart / CUDA::cusolver imported targets if math_libs
    # feeds target_link_libraries.
    set(CUDA_LIBRARIES "-lcublas -lcudart -lcusolver")
    list(APPEND math_libs
        ${CUDA_LIBRARIES}
        "${MAGMA_ROOT}/lib/libmagma.so"
    )
    add_compile_definitions("LIBRPA_USE_CUDA")
endif()
if(ENABLE_NVHPC)
    # NVHPC implies the distributed cuSOLVERMp/cuBLASMp code paths as well
    set(ENABLE_CUSOLVERMP ON)
    set(ENABLE_CUBLASMP ON)
    add_compile_definitions("ENABLE_NVHPC")
endif()
if(ENABLE_CUSOLVERMP)
    add_compile_definitions("ENABLE_CUSOLVERMP")
    # fixed variable-name typo: LIBRAYIES -> LIBRARIES (set and consumed only
    # within this block, so the rename is local and safe)
    # NOTE(review): value kept as one space-separated string to match the
    # CUDA_LIBRARIES convention used above — confirm how math_libs is consumed
    # before converting to a proper list.
    set(CUSOLVERMP_LIBRARIES "-lcal -lcusolverMp")
    list(APPEND math_libs
        ${CUSOLVERMP_LIBRARIES}
    )
endif()
if(ENABLE_CUBLASMP)
    add_compile_definitions("ENABLE_CUBLASMP")
    # fixed variable-name typo: LIBRAYIES -> LIBRARIES (set and consumed only
    # within this block, so the rename is local and safe)
    # NOTE(review): value kept as one space-separated string to match the
    # CUDA_LIBRARIES convention used above — confirm how math_libs is consumed
    # before converting to a proper list.
    set(CUBLASMP_LIBRARIES "-lcublasmp -lcurand -lcublas")
    list(APPEND math_libs
        ${CUBLASMP_LIBRARIES}
    )
endif()
# end of GPU acceleration additions (chenhaobo, 2025-04-26)

if(CMAKE_BUILD_TYPE MATCHES "Debug")
message(STATUS "Build type set to Debug, adding LIBRPA_DEBUG preprocessor directive")
add_compile_definitions("LIBRPA_DEBUG")
Expand Down
12 changes: 9 additions & 3 deletions driver/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@
#include "utils_cmake.h"
#include "utils_mem.h"
#include "utils_mpi_io.h"

#ifdef ENABLE_NVHPC
#include "device_stream.h"
#endif
static void initialize(int argc, char **argv)
{
using namespace LIBRPA::envs;
Expand All @@ -54,7 +56,9 @@ static void initialize(int argc, char **argv)
}

initialize_librpa_environment(MPI_COMM_WORLD, 0, 0, "");

#ifdef ENABLE_NVHPC
device_stream.init();
#endif
// Global profiler begins right after MPI is initialized
Profiler::start("total", "Total");
}
Expand All @@ -77,7 +81,9 @@ static void finalize(bool success)
lib_printf("libRPA failed\n");
}
}

#ifdef ENABLE_NVHPC
device_stream.finalize();
#endif
finalize_librpa_environment();

MPI_Finalize();
Expand Down
12 changes: 11 additions & 1 deletion driver/task_gw_band.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,10 @@
#include "ri.h"
#include "utils_timefreq.h"
#include "write_aims.h"

#ifdef ENABLE_NVHPC
#include "epsilon_cuda.h"
#include <cuda_runtime.h>
#endif
void task_g0w0_band(std::map<Vector3_Order<double>, ComplexMatrix> &sinvS)
{
using LIBRPA::envs::mpi_comm_global_h;
Expand Down Expand Up @@ -156,6 +159,13 @@ void task_g0w0_band(std::map<Vector3_Order<double>, ComplexMatrix> &sinvS)
Wc_freq_q;
if (Params::use_scalapack_gw_wc)
{
#ifdef ENABLE_NVHPC
int numDevices;
cudaError_t cudaStat = cudaGetDeviceCount(&numDevices);
if(cudaStat == cudaSuccess && numDevices>0)
Wc_freq_q = compute_Wc_freq_q_blacs_cuda(chi0, Vq, Vq_cut, epsmac_LF_imagfreq);
else
#endif
Wc_freq_q = compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, epsmac_LF_imagfreq);
}
else
Expand Down
23 changes: 23 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,29 @@ else()
list(APPEND lib_sources get_minimax_local.cpp)
endif()

if(USE_CUDA)
list(APPEND lib_sources
epsilon_cuda.cpp
cuda_connector.cu
)
endif()
if(ENABLE_NVHPC)
list(APPEND lib_sources
cuda_connector.cpp
array_desc_device.cu
device_stream.cpp
matrix_device.cpp
device_connector.cpp
)
# set_source_files_properties(
# base_blacs.cpp
# PROPERTIES
# LANGUAGE CUDA
# # COMPILE_OPTIONS "--x cu"
# )
endif()


target_include_directories(rpa_lib
PRIVATE
# ${CMAKE_CURRENT_LIST_DIR}
Expand Down
21 changes: 21 additions & 0 deletions src/app_rpa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
#include "stl_io_helper.h"
#include "utils_mem.h"
#include "utils_timefreq.h"
#ifdef LIBRPA_USE_CUDA
#include <cuda_runtime.h> // added by hbchen in 2025-5-17
#include <epsilon_cuda.h> // added by hbchen in 2025-7-26
#endif

namespace LIBRPA
{
Expand Down Expand Up @@ -87,6 +91,23 @@ void get_rpa_correlation_energy_(std::complex<double> &rpa_corr,
mpi_comm_global_h.barrier();
Profiler::start("EcRPA", "Compute RPA correlation Energy");
CorrEnergy corr;
#ifdef LIBRPA_USE_CUDA
int deviceCount;
cudaError_t err= cudaGetDeviceCount(&deviceCount);
// lib_printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0);
printf("Number of CUDA devices: %d\n", deviceCount);
// deviceCount!=4 is a bug, because if I don't set the gres=gpu:*, cuda will detect all the gpu device
if(err==cudaSuccess&&deviceCount>0){//说明存在gpu设备
#ifdef ENABLE_NVHPC
if (Params::use_scalapack_ecrpa &&
(LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR ||
LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
corr = compute_RPA_correlation_blacs_2d_cuda(chi0, Vq);
else
#endif
corr = compute_RPA_correlation_cuda(chi0, Vq);
}else
#endif
if (Params::use_scalapack_ecrpa &&
(LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR ||
LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
Expand Down
36 changes: 36 additions & 0 deletions src/array_desc_device.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#include "array_desc_device.h"

// Build a device-friendly snapshot of a host-side BLACS array descriptor by
// copying all of its scalar fields (context, process grid, global/block/local
// dimensions). Members are initialized in declaration order.
Array_Desc_Device::Array_Desc_Device(const LIBRPA::Array_Desc& array_desc)
    : ictxt_(array_desc.ictxt()),
      nprows_(array_desc.nprows()),
      myprow_(array_desc.myprow()),
      npcols_(array_desc.npcols()),
      mypcol_(array_desc.mypcol()),
      m_(array_desc.m()),
      n_(array_desc.n()),
      mb_(array_desc.mb()),
      nb_(array_desc.nb()),
      irsrc_(array_desc.irsrc()),
      icsrc_(array_desc.icsrc()),
      lld_(array_desc.lld()),
      m_local_(array_desc.m_loc()),
      n_local_(array_desc.n_loc())
{
}
// Process coordinate owning global index indxglob under a 0-based
// block-cyclic distribution (ScaLAPACK INDXG2P): blocks of size nb are dealt
// cyclically over nprocs processes starting at isrcproc.
__host__ __device__ int Array_Desc_Device::indx_g2p(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) {
    const int block_id = indxglob / nb;
    return (block_id + isrcproc) % nprocs;
}
// Local index of global index indxglob on its owning process under a 0-based
// block-cyclic distribution (ScaLAPACK INDXG2L). isrcproc is not used by the
// formula; the parameter is kept for signature parity with indx_g2p.
__host__ __device__ int Array_Desc_Device::indx_g2l(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) {
    const int local_block = indxglob / (nb * nprocs);
    const int offset_in_block = indxglob % nb;
    return local_block * nb + offset_in_block;
}
// Local row index of global row gindx, or -1 when the row is out of range
// or owned by a different process row.
__host__ __device__ int Array_Desc_Device::indx_g2l_r(int gindx) const {
    if (gindx >= this->m_) {
        return -1;
    }
    if (indx_g2p(gindx, this->mb_, this->irsrc_, this->nprows_) != this->myprow_) {
        return -1;
    }
    return indx_g2l(gindx, this->mb_, this->irsrc_, this->nprows_);
}
// Local column index of global column gindx, or -1 when the column is out of
// range or owned by a different process column.
__host__ __device__ int Array_Desc_Device::indx_g2l_c(int gindx) const {
    if (gindx >= this->n_) {
        return -1;
    }
    if (indx_g2p(gindx, this->nb_, this->icsrc_, this->npcols_) != this->mypcol_) {
        return -1;
    }
    return indx_g2l(gindx, this->nb_, this->icsrc_, this->npcols_);
}
61 changes: 61 additions & 0 deletions src/array_desc_device.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once  // guard added: the header previously had none, so including it
              // from more than one header/TU redefined the class

#include "base_blacs.h"

// Trivially-copyable mirror of LIBRPA::Array_Desc whose accessors carry
// __host__ __device__ qualifiers, so a copy can be passed by value to CUDA
// kernels. Constructed on the host from an existing BLACS array descriptor.
class Array_Desc_Device{
private:
    int ictxt_;    // BLACS context handle
    // process-grid shape and this process's coordinates within it
    int nprows_;
    int myprow_;
    int npcols_;
    int mypcol_;

    // Array dimensions
    int m_;        // global rows
    int n_;        // global columns
    int mb_;       // row block size
    int nb_;       // column block size
    int irsrc_;    // process row holding the first row block
    int icsrc_;    // process column holding the first column block
    int lld_;      // local leading dimension
    int m_local_;  // rows stored locally on this process
    int n_local_;  // columns stored locally on this process
    // ScaLAPACK-style INDXG2P: process coordinate owning a global index
    __host__ __device__ static int indx_g2p(
        const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs);
    // ScaLAPACK-style INDXG2L: local index of a global index on its owner
    __host__ __device__ static int indx_g2l(
        const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs);
public:
    // Snapshot all scalar fields of a host-side descriptor
    Array_Desc_Device(const LIBRPA::Array_Desc& array_desc);
    // local row index of global row gindx, or -1 if not owned by this process
    __host__ __device__
    int indx_g2l_r(int gindx) const;
    // local column index of global column gindx, or -1 if not owned
    __host__ __device__
    int indx_g2l_c(int gindx) const;
    __host__ __device__
    const int& m() const{ return m_; }
    __host__ __device__
    const int& n() const{ return n_; }
    __host__ __device__
    const int& mb() const{ return mb_; }
    __host__ __device__
    const int& nb() const{ return nb_; }
    __host__ __device__
    const int& irsrc() const{ return irsrc_; }
    __host__ __device__
    const int& icsrc() const{ return icsrc_; }
    __host__ __device__
    const int& lld() const{ return lld_; }
    __host__ __device__
    const int& m_loc() const{ return m_local_; }
    __host__ __device__
    const int& n_loc() const{ return n_local_; }
    __host__ __device__
    const int& nprows() const{ return nprows_; }
    __host__ __device__
    const int& npcols() const{ return npcols_; }
    __host__ __device__
    const int& myprow() const{ return myprow_; }
    __host__ __device__
    const int& mypcol() const{ return mypcol_; }
    __host__ __device__
    const int& ictxt() const{ return ictxt_; }

};
Loading
Loading