diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9d4ee05..4264025f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,14 +20,18 @@ set(LIB_NAME "rpa") set(FORTRAN_LIB_NAME "rpa_f") # options setup
-option(USE_LIBRI "Use LibRI for tensor contraction" OFF)
+option(USE_LIBRI "Use LibRI for tensor contraction" ON)
 option(USE_CMAKE_INC "Use cmake.inc for configure" OFF)
-option(USE_GREENX_API "Use GreenX API for minimax grids generation" OFF)
+option(USE_GREENX_API "Use GreenX API for minimax grids generation" ON)
 option(USE_EXTERNAL_GREENX "Use external GreenX library rather than the packaged one" OFF) option(ENABLE_TEST "Flag to build unit tests" ON) option(ENABLE_DRIVER "Flag to build driver executables" ON) option(ENABLE_FORTRAN_BIND "Flag to build Fotran binding" OFF) option(VERBOSE_OUTPUT "Flag to print verbose information in stdout and process output" OFF)
+option(USE_CUDA "Use CUDA for EcRPA calculation" OFF)
+option(ENABLE_CUSOLVERMP "Use cusolverMp for EcRPA calculation" OFF)
+option(ENABLE_CUBLASMP "Use cuBLASMp for EcRPA calculation" OFF)
+option(ENABLE_NVHPC "Use NVHPC math libraries (cusolverMp and cuBLASMp) for calculation" OFF)
 # NOTE: static library not tested option(BUILD_LIBRPA_SHARED "Flag to build shared libraries" ON)
@@ -61,6 +65,10 @@ if(USE_GREENX_API OR ENABLE_FORTRAN_BIND) else() enable_language(CXX) endif()
+# added by hbchen in 2025-5-19
+if(USE_CUDA)
+ enable_language(CUDA)
+endif()
 # bypass the deprecation warning of classic C++ of oneAPI if(CMAKE_CXX_COMPILER_ID MATCHES Intel)
@@ -177,6 +185,46 @@ if(USE_LIBRI) add_compile_definitions("LIBRPA_USE_LIBRI") endif()
+
+# add GPU acceleration by chenhaobo in 2025-04-26
+# cmake_policy(SET CMP0146 NEW)
+if(USE_CUDA)
+ find_package(CUDA REQUIRED)
+ find_package(CUDAToolkit REQUIRED)
+ add_definitions(-DADD_)
+ include_directories(${MAGMA_ROOT}/include)
+ set(MAGMA_INCLUDE_DIR "${MAGMA_ROOT}/include")
+ set(MAGMA_INCLUDE_DIR $ENV{MAGMA_INCLUDE_DIR})
+ link_directories(${MAGMA_ROOT}/lib)
+ set(CMAKE_CUDA_STANDARD 14)
+ set(CUDA_LIBRARIES "-lcublas -lcudart -lcusolver")
+ list(APPEND math_libs
+ ${CUDA_LIBRARIES}
+ ${MAGMA_ROOT}/lib/libmagma.so
+ )
+ add_compile_definitions("LIBRPA_USE_CUDA")
+endif()
+if(ENABLE_NVHPC)
+ add_compile_definitions("ENABLE_NVHPC")
+ set(ENABLE_CUSOLVERMP ON)
+ set(ENABLE_CUBLASMP ON)
+endif()
+if(ENABLE_CUSOLVERMP)
+ add_compile_definitions("ENABLE_CUSOLVERMP")
+ set(CUSOLVERMP_LIBRARIES "-lcal -lcusolverMp")
+ list(APPEND math_libs
+ ${CUSOLVERMP_LIBRARIES}
+ )
+endif()
+if(ENABLE_CUBLASMP)
+ add_compile_definitions("ENABLE_CUBLASMP")
+ set(CUBLASMP_LIBRARIES "-lcublasmp -lcurand -lcublas")
+ list(APPEND math_libs
+ ${CUBLASMP_LIBRARIES}
+ )
+endif()
+# finish adding GPU acceleration by chenhaobo in 2025-04-26
+
 if(CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Build type set to Debug, adding LIBRPA_DEBUG preprocessor directive") add_compile_definitions("LIBRPA_DEBUG")
diff --git a/driver/main.cpp b/driver/main.cpp
index 736a1838..164671ed 100644
--- a/driver/main.cpp
+++ b/driver/main.cpp
@@ -39,7 +39,9 @@ #include "utils_cmake.h" #include "utils_mem.h" #include "utils_mpi_io.h"
-
+#ifdef ENABLE_NVHPC
+#include "device_stream.h"
+#endif
 static void initialize(int argc, char **argv) { using namespace LIBRPA::envs;
@@ -54,7 +56,9 @@ static void initialize(int argc, char **argv) } initialize_librpa_environment(MPI_COMM_WORLD, 0, 0, "");
-
+ #ifdef ENABLE_NVHPC
+ device_stream.init();
+ #endif
 // Global profiler begins right after MPI is initialized Profiler::start("total", "Total"); }
@@ -77,7 +81,9 @@ static void
finalize(bool success) lib_printf("libRPA failed\n"); } }
-
+ #ifdef ENABLE_NVHPC
+ device_stream.finalize();
+ #endif
 finalize_librpa_environment(); MPI_Finalize();
diff --git a/driver/task_gw_band.cpp b/driver/task_gw_band.cpp
index d5393743..850ba297 100644
--- a/driver/task_gw_band.cpp
+++ b/driver/task_gw_band.cpp
@@ -23,7 +23,10 @@ #include "ri.h" #include "utils_timefreq.h" #include "write_aims.h"
-
+#ifdef ENABLE_NVHPC
+#include "epsilon_cuda.h"
+#include <cuda_runtime.h>
+#endif
 void task_g0w0_band(std::map, ComplexMatrix> &sinvS) { using LIBRPA::envs::mpi_comm_global_h;
@@ -156,6 +159,13 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Wc_freq_q; if (Params::use_scalapack_gw_wc) {
+ #ifdef ENABLE_NVHPC
+ int numDevices;
+ cudaError_t cudaStat = cudaGetDeviceCount(&numDevices);
+ if(cudaStat == cudaSuccess && numDevices>0)
+ Wc_freq_q = compute_Wc_freq_q_blacs_cuda(chi0, Vq, Vq_cut, epsmac_LF_imagfreq);
+ else
+ #endif
 Wc_freq_q = compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, epsmac_LF_imagfreq); } else
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e36be38e..213ef544 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,6 +72,29 @@ else() list(APPEND lib_sources get_minimax_local.cpp) endif()
+if(USE_CUDA)
+ list(APPEND lib_sources
+ epsilon_cuda.cpp
+ cuda_connector.cu
+ )
+endif()
+if(ENABLE_NVHPC)
+ list(APPEND lib_sources
+ cuda_connector.cpp
+ array_desc_device.cu
+ device_stream.cpp
+ matrix_device.cpp
+ device_connector.cpp
+ )
+ # set_source_files_properties(
+ # base_blacs.cpp
+ # PROPERTIES
+ # LANGUAGE CUDA
+ # # COMPILE_OPTIONS "--x cu"
+ # )
+endif()
+
+
 target_include_directories(rpa_lib PRIVATE # ${CMAKE_CURRENT_LIST_DIR}
diff --git a/src/app_rpa.cpp b/src/app_rpa.cpp
index a6cb3128..a837684c 100644
--- a/src/app_rpa.cpp
+++ b/src/app_rpa.cpp
@@ -11,6 +11,10 @@ #include "stl_io_helper.h" #include "utils_mem.h" #include "utils_timefreq.h"
+#ifdef LIBRPA_USE_CUDA
+#include <cuda_runtime.h> // added by hbchen in 2025-5-17
+#include // added by hbchen in 2025-7-26
+#endif
 namespace LIBRPA {
@@ -87,6 +91,23 @@ void get_rpa_correlation_energy_(std::complex<double> &rpa_corr, mpi_comm_global_h.barrier(); Profiler::start("EcRPA", "Compute RPA correlation Energy"); CorrEnergy corr;
+ #ifdef LIBRPA_USE_CUDA
+ int deviceCount;
+ cudaError_t err = cudaGetDeviceCount(&deviceCount);
+ // lib_printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0);
+ printf("Number of CUDA devices: %d\n", deviceCount);
+ // Caveat: if gres=gpu:* is not set for the job, CUDA detects every GPU on the node, so deviceCount may not match the number of allocated devices
+ if(err==cudaSuccess&&deviceCount>0){ // a GPU device is available
+ #ifdef ENABLE_NVHPC
+ if (Params::use_scalapack_ecrpa &&
+ (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR ||
+ LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
+ corr = compute_RPA_correlation_blacs_2d_cuda(chi0, Vq);
+ else
+ #endif
+ corr = compute_RPA_correlation_cuda(chi0, Vq);
+ }else
+ #endif
 if (Params::use_scalapack_ecrpa && (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
diff --git a/src/array_desc_device.cu b/src/array_desc_device.cu
new file mode 100644
index 00000000..5dab0fa5
--- /dev/null
+++ b/src/array_desc_device.cu
@@ -0,0 +1,36 @@
+#include "array_desc_device.h"
+
+Array_Desc_Device::Array_Desc_Device(const LIBRPA::Array_Desc& array_desc) {
+ this->ictxt_ = array_desc.ictxt();
+ this->m_ = array_desc.m();
+ this->n_ = array_desc.n();
+ this->mb_ = array_desc.mb();
+ this->nb_ = array_desc.nb();
+ this->lld_ = 
array_desc.lld(); + this->irsrc_ = array_desc.irsrc(); + this->icsrc_ = array_desc.icsrc(); + this->m_local_ = array_desc.m_loc(); + this->n_local_ = array_desc.n_loc(); + this->myprow_ = array_desc.myprow(); + this->mypcol_ = array_desc.mypcol(); + this->nprows_ = array_desc.nprows(); + this->npcols_ = array_desc.npcols(); +} +__host__ __device__ int Array_Desc_Device::indx_g2p(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return (isrcproc + indxglob / nb) % nprocs; +} +__host__ __device__ int Array_Desc_Device::indx_g2l(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__host__ __device__ int Array_Desc_Device::indx_g2l_r(int gindx)const{ + return this->myprow_ != indx_g2p(gindx, this->mb_, this->irsrc_, this->nprows_) || + gindx >= this->m_ + ? -1 + : indx_g2l(gindx, this->mb_, this->irsrc_, this->nprows_); +} +__host__ __device__ int Array_Desc_Device::indx_g2l_c(int gindx)const{ + return this->mypcol_ != indx_g2p(gindx, this->nb_, this->icsrc_, this->npcols_) || + gindx >= this->n_ + ? -1 + : indx_g2l(gindx, this->nb_, this->icsrc_, this->npcols_); +} \ No newline at end of file diff --git a/src/array_desc_device.h b/src/array_desc_device.h new file mode 100644 index 00000000..15e25e80 --- /dev/null +++ b/src/array_desc_device.h @@ -0,0 +1,61 @@ +#include "base_blacs.h" +class Array_Desc_Device{ +private: + int ictxt_; + // int nprocs_; + // int myid_; + int nprows_; + int myprow_; + int npcols_; + int mypcol_; + + // Array dimensions + int m_; + int n_; + int mb_; + int nb_; + int irsrc_; + int icsrc_; + int lld_; + int m_local_; + int n_local_; + __host__ __device__ static int indx_g2p( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); + __host__ __device__ static int indx_g2l( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); +public: + Array_Desc_Device(const LIBRPA::Array_Desc& array_desc); + __host__ __device__ + int indx_g2l_r(int gindx) const; + __host__ __device__ + int indx_g2l_c(int gindx) const; + __host__ __device__ + const int& m() const{ return m_; } + __host__ __device__ + const int& n() const{ return n_; } + __host__ __device__ + const int& mb() const{ return mb_; } + __host__ __device__ + const int& nb() const{ return nb_; } + __host__ __device__ + const int& irsrc() const{ return irsrc_; } + __host__ __device__ + const int& icsrc() const{ return icsrc_; } + __host__ __device__ + const int& lld() const{ return lld_; } + __host__ __device__ + const int& m_loc() const{ return m_local_; } + __host__ __device__ + const int& n_loc() const{ return n_local_; } + __host__ __device__ + const int& nprows() const{ return nprows_; } + __host__ __device__ + const int& npcols() const{ return npcols_; } + __host__ __device__ + const int& myprow() const{ return myprow_; } + __host__ __device__ + const int& mypcol() const{ return mypcol_; } + __host__ __device__ + const int& ictxt() const{ return ictxt_; } + +}; \ No newline at end of file diff --git a/src/cuda_connector.cpp b/src/cuda_connector.cpp new file mode 100644 index 00000000..f273c680 --- /dev/null +++ b/src/cuda_connector.cpp @@ -0,0 +1,2108 @@ +#include "cuda_connector.h" +#include +// cusolverMp include +#include "helpers.h" +#include +#include "lapack_connector.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +#include "device_stream.h" +// #define CUSOLVERMP_MPI_GRID_COL_MAJOR +// #define OPEN_TEST_FOR_LU_DECOMPOSITION +// 
#ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// #include +// #include +// #endif +// #ifdef ENABLE_CUBLASMP +// // #include +// // #include +// #endif +// #include +// #ifdef ENABLE_CUSOLVERMP +// void CudaConnector::pzgetrf_cusolverMp(const int &m, const int &n, std::complex *h_C_A, const int &ia, +// const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int *ipiv, int &h_info_getrf,const char order) +// { +// const int64_t M = arrdesc_pi.m(); +// const int64_t N = arrdesc_pi.n(); + +// const int64_t IA = ia; +// const int64_t JA = ja; + +// /* Tile sizes */ +// const int64_t MA = arrdesc_pi.mb(); +// const int64_t NA = arrdesc_pi.nb(); +// int numRowDevices, numColDevices; +// if(order=='C'){ +// numRowDevices = arrdesc_pi.npcols(); +// numColDevices = arrdesc_pi.nprows(); +// } +// else if(order=='R'){ +// numRowDevices = arrdesc_pi.nprows(); +// numColDevices = arrdesc_pi.npcols(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + + +// const uint32_t RSRCA = 0; +// const uint32_t CSRCA = 0; + +// int mpiCommSize, mpiRank; +// MPI_Comm_size(MPI_COMM_WORLD, &mpiCommSize); +// MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank); + +// int local_device = getLocalDevice(); +// int numDevices = 0; + +// // printf("Number of devices = %d\n", numDevices); +// // printf("local_device = %d, mpiRank = %d, mpiCommSize = %d\n", local_device, mpiRank, mpiCommSize); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_set_device=omp_get_wtime(); +// #endif +// cudaError_t cudaStat = cudaSetDevice(local_device); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for set device:%f seconds, rank:%d\n", omp_get_wtime() - time_start_set_device,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); +// cudaStat = cudaFree(0); +// assert(cudaStat == cudaSuccess); +// // const int mpiRank_T=arrdesc_pi.myprow()+ arrdesc_pi.mypcol()*arrdesc_pi.npcols(); +// const int rank = mpiRank; +// // printf("mpiRank=%d,mpiRank_T=%d\n", mpiRank, mpiRank_T); +// const int commSize = mpiCommSize; +// /* Library handles */ +// cusolverMpHandle_t cusolverMpHandle = NULL; +// cal_comm_t cal_comm = NULL; + +// /* Error codes */ +// cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; +// calError_t calStat = CAL_OK; +// cudaStat = cudaSuccess; + +// /* User defined stream */ +// cudaStream_t localStream = NULL; +// cal_comm_create_params_t params; +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = commSize; +// params.local_device = local_device; + +// calStat = cal_comm_create(params, &cal_comm); +// assert(calStat == CAL_OK); + +// /* Create local stream */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_create_stream=omp_get_wtime(); +// #endif +// cudaStat = cudaStreamCreate(&localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for create stream:%f seconds, rank:%d\n", omp_get_wtime() - time_start_create_stream,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); + +// /* Initialize cusolverMp library handle */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_cusolverMpCreate=omp_get_wtime(); +// #endif +// cusolverStat = cusolverMpCreate(&cusolverMpHandle, local_device, localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for cusolverMpCreate:%f seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank); +// #endif +// assert(cusolverStat == 
CUSOLVER_STATUS_SUCCESS); + +// /* cusolverMp grids */ +// cusolverMpGrid_t gridA_C = NULL; + +// /* cusolverMp matrix descriptors */ +// cusolverMpMatrixDescriptor_t descrA_C = NULL; + +// /* Distributed matrices */ +// void* d_C_A = NULL; // for cuDoubleComplex +// int64_t* d_ipiv = NULL; + +// /* Distributed device workspace */ +// void* d_work_getrf_C = NULL; + +// /* Distributed host workspace */ +// void* h_work_getrf_C = NULL; + +// /* size of workspace on device */ +// size_t workspaceInBytesOnDevice_getrf_C = 0; + +// /* size of workspace on host */ +// size_t workspaceInBytesOnHost_getrf_C = 0; + +// /* error codes from cusolverMp (device) */ +// int* d_info_getrf = NULL; + +// /* error codes from cusolverMp (host) */ +// // int h_info_getrf = 0; + +// /* Single process per device */ +// assert((numRowDevices * numColDevices) == commSize); + +// /* =========================================== */ +// /* Create inputs on master rank */ +// /* =========================================== */ + +// const int64_t lda = (IA - 1) + N; +// const int64_t colsA = (JA - 1) + N; +// // cuDoubleComplex* h_C_A = NULL; +// int64_t LLDA, localColsA; +// if(order=='C'){ +// localColsA =arrdesc_pi.m_loc(); +// LLDA=arrdesc_pi.n_loc(); +// }else if(order=='R'){ +// localColsA =arrdesc_pi.n_loc(); +// LLDA=arrdesc_pi.m_loc(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + +// /* Allocate global d_A */ +// cudaStat = cudaMalloc((void**)&d_C_A, localColsA * LLDA * sizeof(cuDoubleComplex)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* CREATE GRID DESCRIPTORS */ +// /* =========================================== */ +// if(order=='C'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); +// }else if(order=='R'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* =========================================== */ +// /* CREATE MATRIX DESCRIPTORS */ +// /* =========================================== */ +// cusolverStat = cusolverMpCreateMatrixDesc( +// &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* Allocate global d_ipiv */ +// /* REMARK : ipiv overlaps A[IA, JA:JA+N] as in Netlib's ScaLAPACK */ +// cudaStat = cudaMalloc((void**)&d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* ALLOCATE D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMalloc((void**)&d_info_getrf, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* RESET D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMemset(d_info_getrf, 1, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ +// /* =========================================== */ +// cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// 
CUDA_C_64F, +// &workspaceInBytesOnDevice_getrf_C, +// &workspaceInBytesOnHost_getrf_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* =========================================== */ +// /* ALLOCATE PGETRF WORKSPACE */ +// /* =========================================== */ +// cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C); +// assert(cudaStat == cudaSuccess); +// h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); +// assert(h_work_getrf_C != NULL); + +// // copy matrix from h_C_A to d_C_A +// std::complex* h_C_A_temp = NULL; +// size_t temp_size = (int64_t)localColsA * (int64_t)LLDA * sizeof(cuDoubleComplex); +// if(order=='C'){ +// h_C_A_temp = LapackConnector::transpose(h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// cudaStat = cudaMemcpy(d_C_A, h_C_A_temp, temp_size, cudaMemcpyHostToDevice); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(d_C_A, h_C_A, temp_size, cudaMemcpyHostToDevice); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to device */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* =========================================== */ +// /* CALL PGETRF */ +// /* =========================================== */ +// h_info_getrf=1; +// // printf("h_info_getrf before LU composition(cuDoubleComplex) : %d\n", h_info_getrf); +// // printf("LU decomposition begin(cuDoubleComplex)\n"); +// double start_time_C = omp_get_wtime(); +// cusolverStat = cusolverMpGetrf(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// CUDA_C_64F, +// d_work_getrf_C, +// workspaceInBytesOnDevice_getrf_C, +// h_work_getrf_C, +// workspaceInBytesOnHost_getrf_C, +// d_info_getrf); + +// /* sync after cusolverMpGetrf */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); +// // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank); + +// /* copy d_info_getrf to host */ +// cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); +// assert(cudaStat == cudaSuccess); +// /* wait for d_info_getrf copy */ +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); +// // printf("h_info_getrf after composition(cuDoubleComplex) : %d\n", h_info_getrf); +// /* check return value of cusolverMpGetrf */ +// assert(h_info_getrf == 0); +// // copy d_ipiv to ipiv +// cudaStat = cudaMemcpy(ipiv, d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t), cudaMemcpyDeviceToHost); +// assert(cudaStat == cudaSuccess); +// // copy matrix from d_C_A to h_C_A +// if(order=='C'){ +// cudaStat = cudaMemcpy(h_C_A_temp, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// LapackConnector::transpose(h_C_A_temp, h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(h_C_A, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to host */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* 
=========================================== */ +// /* CHECK RESIDUAL ON MASTER */ +// /* =========================================== */ + +// /* =========================================== */ +// /* CLEAN UP HOST WORKSPACE ON MASTER */ +// /* =========================================== */ +// cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// cusolverStat = cusolverMpDestroyGrid(gridA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); +// if (d_C_A != NULL) +// { +// cudaStat = cudaFree(d_C_A); +// assert(cudaStat == cudaSuccess); +// d_C_A = NULL; +// } +// if (d_ipiv != NULL) +// { +// cudaStat = cudaFree(d_ipiv); +// assert(cudaStat == cudaSuccess); +// d_ipiv = NULL; +// } + +// if(h_C_A_temp != NULL) +// { +// delete [] h_C_A_temp; +// h_C_A_temp = NULL; +// } + + +// if (d_work_getrf_C != NULL) +// { +// cudaStat = cudaFree(d_work_getrf_C); +// assert(cudaStat == cudaSuccess); +// d_work_getrf_C = NULL; +// } + +// if (d_info_getrf != NULL) +// { +// cudaStat = cudaFree(d_info_getrf); +// assert(cudaStat == cudaSuccess); +// d_info_getrf = NULL; +// } +// if (h_work_getrf_C) +// { +// free(h_work_getrf_C); +// h_work_getrf_C = NULL; +// } +// /* Destroy cusolverMp handle */ +// cusolverStat = cusolverMpDestroy(cusolverMpHandle); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* sync before cal_comm_destroy */ +// calStat = cal_comm_barrier(cal_comm, localStream); +// assert(calStat == CAL_OK); + +// /* destroy CAL communicator */ +// calStat = cal_comm_destroy(cal_comm); +// assert(calStat == CAL_OK); + +// /* destroy user stream */ +// cudaStat = cudaStreamDestroy(localStream); +// assert(cudaStat == cudaSuccess); + +// /* MPI barrier before MPI_Finalize */ +// MPI_Barrier(MPI_COMM_WORLD); +// // printf("success in test\n"); +// return; +// } +// #endif + +#ifdef ENABLE_NVHPC +void CudaConnector::pzgetrf_nvhpc(const GpuDeviceStream&gpu_dev_stream, ComplexMatrixDevice &d_A, const int &ia, + const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, const char& order) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int mpiRank = gpu_dev_stream.rank; + int mpiCommSize = gpu_dev_stream.nranks; + + int local_device = gpu_dev_stream.local_device; + + const int rank = mpiRank; + const int commSize = mpiCommSize; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + cal_comm_t cal_comm = gpu_dev_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = gpu_dev_stream.stream; + /* Initialize cusolverMp library handle */ + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + double time_start_cusolverMpCreate=omp_get_wtime(); + #endif + // cusolverStat = cusolverMpCreate(&cusolverMpHandle, local_device, localStream); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("time for cusolverMpCreate:%f 
seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank); + #endif + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A.ptr(); // for cuDoubleComplex + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + assert((numRowDevices * numColDevices) == commSize); + + /* =========================================== */ + /* Create inputs on master rank */ + /* =========================================== */ + + const int64_t lda = (IA - 1) + N; + const int64_t colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + CUDA_C_64F, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C); + assert(cudaStat == cudaSuccess); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + + /* sync wait for data to arrive to device */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + cusolverStat = cusolverMpGetrf(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + 
descrA_C, + d_ipiv, + CUDA_C_64F, + d_work_getrf_C, + workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, + workspaceInBytesOnHost_getrf_C, + d_info_getrf); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + gpu_dev_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + + /* sync wait for data to arrive to host */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + cudaStat = cudaFree(d_work_getrf_C); + assert(cudaStat == cudaSuccess); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + /* Destroy cusolverMp handle */ + // cusolverStat = cusolverMpDestroy(cusolverMpHandle); + // assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + // printf("success in test\n"); + return; +} +void CudaConnector::pgetrf_nvhpc_mixed_precision( + const GpuDeviceStream&gpu_dev_stream, void *d_A, + const int &ia, const int &ja, + const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, + const cudaDataType_t &computeType,const char &order + ) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'||order=='c'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'||order=='r'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int local_device = gpu_dev_stream.local_device; + + const int rank = gpu_dev_stream.rank; + const int commSize = gpu_dev_stream.nranks; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + cal_comm_t cal_comm = gpu_dev_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = gpu_dev_stream.stream; + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A; // + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + 
assert((numRowDevices * numColDevices) == commSize); + + const int64_t lda = (IA - 1) + N; + const int64_t colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'||order=='c'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'||order=='r'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'||order=='c'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'||order=='r'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + computeType, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream)); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + CUSOLVERMP_CHECK(cusolverMpGetrf( + cusolverMpHandle, N, N, + d_C_A, IA, JA, descrA_C, + d_ipiv, computeType, + d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, workspaceInBytesOnHost_getrf_C, + d_info_getrf)); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + gpu_dev_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = 
cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream)); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + return; +} +void CudaConnector::pgetrf_nvhpc_mixed_precision( + void *d_A, const int &ia, const int &ja, + const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, + const cudaDataType_t &computeType,const char &order + ) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'||order=='c'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'||order=='r'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int local_device = device_stream.local_device; + + const int rank = mpi_comm_global_h.myid; + const int commSize = mpi_comm_global_h.nprocs; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = device_stream.cusolverMp_handle; + cal_comm_t cal_comm = device_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = (cudaStream_t)device_stream.stream; + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A; // + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + assert((numRowDevices * numColDevices) == commSize); + + const int64_t lda = (IA - 1) + N; + const int64_t colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'||order=='c'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'||order=='r'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'||order=='c'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'||order=='r'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = 
cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + computeType, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream)); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + CUSOLVERMP_CHECK(cusolverMpGetrf( + cusolverMpHandle, N, N, + d_C_A, IA, JA, descrA_C, + d_ipiv, computeType, + d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, workspaceInBytesOnHost_getrf_C, + d_info_getrf)); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + device_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream)); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + return; +} +void CudaConnector::pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + const void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &arrdesc_A, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + int* d_info, const cudaDataType_t& compute_type, + const char& order +) +{ + const int64_t N = arrdesc_A.m(); + const int64_t NRHS = arrdesc_B.n(); + assert(arrdesc_A.m()==arrdesc_A.n()); + + int h_info = 1; + size_t workspaceInBytesOnDevice, workspaceInBytesOnHost; + void* h_work = NULL; + void* d_work = NULL; + const int64_t mbA = arrdesc_A.mb(); + const int64_t nbA = arrdesc_A.nb(); + const int64_t mbB = arrdesc_B.mb(); + const int64_t nbB = arrdesc_B.nb(); + assert(arrdesc_A.npcols() == arrdesc_B.npcols()); + assert(arrdesc_A.nprows() == arrdesc_B.nprows()); + int numRowDevices, 
numColDevices; + ORDER_CHECK(order); + if(order=='C'||order=='c'){ + assert(N==NRHS); + numRowDevices = arrdesc_A.npcols(); + numColDevices = arrdesc_A.nprows(); + } + else{ + numRowDevices = arrdesc_A.nprows(); + numColDevices = arrdesc_A.npcols(); + } + int64_t LLDA, localColsA, LLDB, localColsB; + if(order=='C'||order=='c'){ + LLDA=arrdesc_A.n_loc(); + localColsA =arrdesc_A.m_loc(); + LLDB=arrdesc_B.n_loc(); + localColsB =arrdesc_B.m_loc(); + }else{ + LLDA=arrdesc_A.m_loc(); + localColsA =arrdesc_A.n_loc(); + LLDB=arrdesc_B.m_loc(); + localColsB =arrdesc_B.n_loc(); + } + int mpiCommSize = gpu_dev_stream.nranks; + int rank = gpu_dev_stream.rank; + + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + + cusolverMpGrid_t grid = NULL; + cusolverMpMatrixDescriptor_t descrA = NULL; + cusolverMpMatrixDescriptor_t descrB = NULL; + + CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid( + cusolverMpHandle, &grid, gpu_dev_stream.cal_comm, + numRowDevices, numColDevices, + (order=='c'||order=='C')?CUSOLVERMP_GRID_MAPPING_COL_MAJOR:CUSOLVERMP_GRID_MAPPING_ROW_MAJOR) + ); + + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrA, grid, compute_type, + (IA - 1) + N, (JA - 1) + N, + mbA, nbA, 0, 0, LLDA) + ); + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrB, grid, compute_type, + (IB - 1) + N, (JB - 1) + NRHS, + mbB, nbB, 0, 0, LLDB) + ); + + CUSOLVERMP_CHECK(cusolverMpGetrs_bufferSize( + cusolverMpHandle, trans, N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + &workspaceInBytesOnDevice,&workspaceInBytesOnHost) + ); + gpu_dev_stream.calSync(); + if (workspaceInBytesOnHost > 0) + { + h_work = (void*)malloc(workspaceInBytesOnHost); + assert(h_work != NULL); + } + if (workspaceInBytesOnDevice > 0) + { + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream)); + } + gpu_dev_stream.calSync(); + CUSOLVERMP_CHECK(cusolverMpGetrs( + cusolverMpHandle, trans, + N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + d_work, workspaceInBytesOnDevice, + h_work, workspaceInBytesOnHost, + d_info) + ); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if(h_info!=0) + { + fprintf(stderr, "Error: cusolverMpgetrs failed with info=%d\n", h_info); + exit(1); + } + if(h_work!=NULL) + { + free(h_work); + } + if(d_work!=NULL){ + CUDA_CHECK(cudaFreeAsync(d_work, gpu_dev_stream.stream)); + } +} +void CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &arrdesc_A, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + const cudaDataType_t& compute_type, const char& order +) +{ + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.n_loc(),gpu_dev_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.m_loc(),gpu_dev_stream.stream)); + } + pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_A, 1, 1, arrdesc_A, + d_ipiv, d_info, + CUDA_C_64F, order + ); + pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, trans, + d_A, 1, 1, arrdesc_A, + d_ipiv, + d_B, 1, 1, arrdesc_B, + d_info, + CUDA_C_64F, order + ); + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, 
gpu_dev_stream.stream)); +} +// void CudaConnector::pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, +// const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, +// const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, +// const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C) +// { +// using input_t = cuDoubleComplex; +// using output_t = cuDoubleComplex; +// using compute_t = cuDoubleComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_64F; +// const cudaDataType_t cuda_output_type = CUDA_C_64F; +// cublasComputeType_t cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + +// int64_t mbA = arrdesc_A.mb(); +// int64_t nbA = arrdesc_A.nb(); +// int64_t mbB = arrdesc_B.mb(); +// int64_t nbB = arrdesc_B.nb(); +// int64_t mbC = arrdesc_C.mb(); +// int64_t nbC = arrdesc_C.nb(); +// int nprow = arrdesc_A.nprows(); +// int npcol = arrdesc_A.npcols(); +// char grid_layout = 'r'; +// bool verbose = false; +// int rank, nranks; +// MPI_Comm_size(MPI_COMM_WORLD, &nranks); +// MPI_Comm_rank(MPI_COMM_WORLD, &rank); +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = arrdesc_A.myprow(); +// const int mypcol = arrdesc_A.mypcol(); +// if(rank==0) +// { +// printf("m:%" PRId64 ", n:%" PRId64 ", k:%" PRId64 "\n", m, n, k); +// printf("nprow:%d, npcol:%d\n", nprow, npcol); +// printf("mbA:%" PRId64 ", nbA:%" PRId64 "\n", mbA, nbA); +// } +// const int local_device = getLocalDevice(); +// printf("myrank:%d, myprow:%d, mypcol:%d, local_device:%d\n", rank, myprow, mypcol, local_device); +// CUDA_CHECK(cudaSetDevice(local_device)); +// CUDA_CHECK(cudaFree(nullptr)); + +// cal_comm_t cal_comm; +// cal_comm_create_params_t params; +// { +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = nranks; +// params.local_device = local_device; + +// CAL_CHECK(cal_comm_create(params, &cal_comm)); +// } +// cudaStream_t stream = nullptr; +// CUDA_CHECK(cudaStreamCreate(&stream)); + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + +// cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; + +// input_t* d_A = nullptr; +// input_t* d_B = nullptr; +// output_t* d_C = nullptr; + +// void* d_work = nullptr; + +// compute_t alpha = {alphaD,0.0}; +// compute_t beta = {betaD,0.0}; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// const int64_t llda = cublasMpNumroc(global_m_a, mbA, myprow, 0, nprow); +// const int64_t loc_n_a = cublasMpNumroc(global_n_a, nbA, mypcol, 0, npcol); +// printf("rank:%d, llda=%" PRId64 ", loc_n_a=%" PRId64 "\n", rank, llda, loc_n_a); +// const int64_t lldb = cublasMpNumroc(global_m_b, mbB, myprow, 0, nprow); +// const int64_t loc_n_b = cublasMpNumroc(global_n_b, nbB, mypcol, 0, 
npcol); + +// const int64_t lldc = cublasMpNumroc(global_m_c, mbC, myprow, 0, nprow); +// const int64_t loc_n_c = cublasMpNumroc(global_n_c, nbC, mypcol, 0, npcol); + +// CUDA_CHECK(cudaMallocAsync(&d_A, llda * loc_n_a * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_B, lldb * loc_n_b * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_C, lldc * loc_n_c * sizeof(output_t), stream)); + +// CUDA_CHECK(cudaMemcpyAsync(d_A, A, llda * loc_n_a * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_B, B, lldb * loc_n_b * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_C, C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyHostToDevice, stream)); + +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// cal_comm, +// &grid)); + +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// cublasOperation_t transA,transB; +// if(transa=='N') +// transA=CUBLAS_OP_N; +// else if(transa=='T') +// transA=CUBLAS_OP_T; +// else if(transa=='C') +// transA=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transa=%c is not supported\n", transa); +// exit(1); +// } +// if(transb=='N') +// transB=CUBLAS_OP_N; +// else if(transb=='T') +// transB=CUBLAS_OP_T; +// else if(transb=='C') +// transB=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transb=%c is not supported\n", transb); +// exit(1); +// } + + + +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// &alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// &beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// &workspaceInBytesOnDevice, +// &workspaceInBytesOnHost)); + +// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); +// printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); +// std::vector h_work(workspaceInBytesOnHost); + +// CUDA_CHECK(cudaStreamSynchronize(stream)); + +// const double begin = MPI_Wtime(); + +// CUBLASMP_CHECK(cublasMpGemm( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// &alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// &beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// d_work, +// workspaceInBytesOnDevice, +// h_work.data(), +// workspaceInBytesOnHost)); +// printf("Duration(before synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank); +// CUDA_CHECK(cudaStreamSynchronize(stream)); +// printf("Duration(after synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank); +// if(verbose){ +// } +// CUDA_CHECK(cudaMemcpyAsync(C, d_C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyDeviceToHost, stream)); +// CUDA_CHECK(cudaStreamSynchronize(stream)); + +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); +// 
CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + +// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + +// CUBLASMP_CHECK(cublasMpDestroy(handle)); + +// CUDA_CHECK(cudaFreeAsync(d_A, stream)); +// CUDA_CHECK(cudaFreeAsync(d_B, stream)); +// CUDA_CHECK(cudaFreeAsync(d_C, stream)); +// CUDA_CHECK(cudaFreeAsync(d_work, stream)); + +// CAL_CHECK(cal_comm_destroy(cal_comm)); +// CUDA_CHECK(cudaStreamDestroy(stream)); + +// MPI_Barrier(MPI_COMM_WORLD); + +// } +void CudaConnector::pgemm_device(cublasMpHandle_t handle,cublasOperation_t transA,cublasOperation_t transB,const int &m,const int &n,const int &k, + const void *alpha, + const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja, + const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb, + const void *beta, + ComplexMatrixDevice &d_C,int64_t ic,int64_t jc, + cublasComputeType_t cublas_compute_type) +{ + void* d_work; + + size_t workspaceInBytesOnDevice,workspaceInBytesOnHost; + CUBLASMP_CHECK(cublasMpGemm_bufferSize(handle,transA,transB,m,n,k, + alpha, + (const void *)(d_A.ptr()),ia,ja,d_A.desc_cublas, + (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas, + beta, + d_C.ptr(),ic,jc,d_C.desc_cublas, + cublas_compute_type, + &workspaceInBytesOnDevice, + &workspaceInBytesOnHost)); + CUDA_CHECK(cudaMalloc((void**)&d_work,workspaceInBytesOnDevice)); + std::vector h_work(workspaceInBytesOnHost); + CUBLASMP_CHECK(cublasMpGemm(handle,transA,transB,m,n,k, + alpha, + (const void *)d_A.ptr(),ia,ja,d_A.desc_cublas, + (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas, + beta, + d_C.ptr(),ic,jc,d_C.desc_cublas, + cublas_compute_type, + d_work,workspaceInBytesOnDevice, + h_work.data(),workspaceInBytesOnHost)); + + CUDA_CHECK(cudaFree(d_work)); +} + +void CudaConnector::pgemm_nvhpc(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + ComplexMatrixDevice & d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type) +{ + using input_t = cuDoubleComplex; + using output_t = cuDoubleComplex; + using compute_t = cuDoubleComplex; + const cudaDataType_t cuda_input_type = CUDA_C_64F; + const cudaDataType_t cuda_output_type = CUDA_C_64F; + + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=gpu_dev_stream.rank; + int nranks=gpu_dev_stream.nranks; + + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? 
rank / nprow : rank % npcol); + + cublasMpHandle_t handle = nullptr; + CUBLASMP_CHECK(cublasMpCreate(&handle, gpu_dev_stream.stream)); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, &grid) + ); + + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A.ptr(), ia, ja, descA, + d_B.ptr(), ib, jb, descB, + beta, + d_C.ptr(), ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + gpu_dev_stream.cudaSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream)); + std::vector h_work(workspaceInBytesOnHost); + + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A.ptr(), ia, ja, descA, + d_B.ptr(), ib, jb, descB, + beta, + d_C.ptr(), ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost + )); + + gpu_dev_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); + CUDA_CHECK(cudaFree(d_work)); + + MPI_Barrier(MPI_COMM_WORLD); +} +// void CudaConnector::pgemm_nvhpc_cuFloatComplex(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, +// const void *alpha, +// const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, +// const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, +// const void *beta, +// cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, +// cublasComputeType_t cublas_compute_type) +// { +// using input_t = cuFloatComplex; +// using output_t = cuFloatComplex; +// using compute_t = cuFloatComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_32F; +// const cudaDataType_t cuda_output_type = CUDA_C_32F; + +// int64_t mbA = array_descA.mb(); +// int64_t nbA = array_descA.nb(); +// int64_t mbB = array_descB.mb(); +// int64_t nbB = array_descB.nb(); +// int64_t mbC = array_descC.mb(); +// int64_t nbC = array_descC.nb(); +// int nprow = array_descA.nprows(); +// int npcol = array_descA.npcols(); +// int llda= array_descA.lld(); +// int lldb= array_descB.lld(); +// int lldc= array_descC.lld(); +// 
cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; +// char grid_layout = 'r'; +// int rank=gpu_dev_stream.rank; +// int nranks=gpu_dev_stream.nranks; +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); +// const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); +// const int local_device = gpu_dev_stream.local_device; +// cudaStream_t stream = gpu_dev_stream.stream; + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); +// void* d_work = nullptr; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// gpu_dev_stream.cudaSync(); +// // printf("before create grid, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// gpu_dev_stream.cal_comm, +// &grid)); +// // printf("after create grid, rank:%d\n", rank); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// // printf("after create desc, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// &workspaceInBytesOnDevice, +// &workspaceInBytesOnHost)); +// // printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); +// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); +// std::vector h_work(workspaceInBytesOnHost); + +// gpu_dev_stream.cudaSync(); + +// // const double begin = MPI_Wtime(); +// // printf("before gemm, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// d_work, +// workspaceInBytesOnDevice, +// h_work.data(), +// workspaceInBytesOnHost)); +// // printf("after gemm, rank:%d\n", rank); +// CUDA_CHECK(cudaStreamSynchronize(stream)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + +// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + +// CUBLASMP_CHECK(cublasMpDestroy(handle)); +// CUDA_CHECK(cudaFree(d_work)); + +// MPI_Barrier(MPI_COMM_WORLD); +// } +void CudaConnector::pgemm_nvhpc_mixed_precision( + 
const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=gpu_dev_stream.rank; + int nranks=gpu_dev_stream.nranks; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); + const int local_device = gpu_dev_stream.local_device; + cudaStream_t stream = gpu_dev_stream.stream; + + cublasMpHandle_t handle = nullptr; + // printf("before create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + // printf("after create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + // gpu_dev_stream.cudaSync(); + // printf("before create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? 
CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + // printf("after create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + // printf("workspaceInBytesOnDevice=%zu GiB, workspaceInBytesOnHost=%zu GiB, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); + gpu_dev_stream.cudaSync(); + // printf("before malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + // printf("after malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + std::vector h_work(workspaceInBytesOnHost); + + // gpu_dev_stream.cudaSync(); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + // gpu_dev_stream.cudaSync(); + // printf("before free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + // printf("after free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + gpu_dev_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); + + MPI_Barrier(MPI_COMM_WORLD); +} + + +void CudaConnector::pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; 
+ cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=mpi_comm_global_h.myid; + int nranks=mpi_comm_global_h.nprocs; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); + const int local_device = device_stream.local_device; + cudaStream_t stream = (cudaStream_t)device_stream.stream; + + cublasMpHandle_t handle = nullptr; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + device_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + device_stream.cudaSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + + device_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +void CudaConnector::pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type, + const char& order +) +{ + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + + int nprow, npcol; + 
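+    // cublasMpGeadd below performs a distributed matrix addition in the style of
+    // ScaLAPACK p?geadd, updating B from alpha*op(A) and beta*B on the block-cyclic
+    // distribution. When `order` is 'c'/'C', the process-grid extents, global sizes and
+    // local leading dimensions are taken from the transposed view of the Array_Desc and
+    // a column-major grid layout is used, so column-major callers need no repacking.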
assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + ORDER_CHECK(order); + if(order == 'c'||order == 'C'){ + nprow = array_descA.npcols(); + npcol = array_descA.nprows(); + }else{ + nprow = array_descA.nprows(); + npcol = array_descA.npcols(); + } + int llda,loc_n_a,lldb,loc_n_b,mA,nA,mB,nB; + if(order == 'c'||order == 'C'){ + llda = array_descA.n_loc(); + loc_n_a = array_descA.m_loc(); + lldb = array_descB.n_loc(); + loc_n_b = array_descB.m_loc(); + mA = array_descA.n(); + nA = array_descA.m(); + mB = array_descB.n(); + nB = array_descB.m(); + + }else{ + llda = array_descA.m_loc(); + loc_n_a = array_descA.n_loc(); + lldb = array_descB.m_loc(); + loc_n_b = array_descB.n_loc(); + mA = array_descA.m(); + nA = array_descA.n(); + mB = array_descB.m(); + nB = array_descB.n(); + } + if(trans == CUBLAS_OP_N){ + assert(mA == nA); + assert(mB == nB); + }else{ + assert(mA == nB); + assert(mB == nA); + } + const int global_m_a = ia-1+mA; + const int global_n_a = ja-1+nA; + const int global_m_b = ib-1+mB; + const int global_n_b = jb-1+nB; + int rank = gpu_dev_stream.rank; + int nranks = gpu_dev_stream.nranks; + + const int local_device = gpu_dev_stream.local_device; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + order=='c'||order=='C'?CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGeadd_bufferSize( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + gpu_dev_stream.calSync(); + std::vector h_work(workspaceInBytesOnHost); + CUBLASMP_CHECK(cublasMpGeadd( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} + +void CudaConnector::pgemr2d_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const int& m, const int& n, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type +) +{ + + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + int nprow, npcol; + assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + nprow = 
array_descA.nprows(); + npcol = array_descA.npcols(); + + int llda = array_descA.m_loc(); + int loc_n_a = array_descA.n_loc(); + + int lldb = array_descB.m_loc(); + int loc_n_b = array_descB.n_loc(); + + const int global_m_a = ia-1+m; + const int global_n_a = ja-1+n; + + const int global_m_b = ib-1+m; + const int global_n_b = jb-1+n; + + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid) + ); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGemr2D_bufferSize( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + CUBLASMP_CHECK(cublasMpGemr2D( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +#endif + diff --git a/src/cuda_connector.cu b/src/cuda_connector.cu new file mode 100644 index 00000000..f6ffa7db --- /dev/null +++ b/src/cuda_connector.cu @@ -0,0 +1,601 @@ +#include "cuda_connector.h" +#include "device_stream.h" +// add by hbchen in 2025-05-19 +__device__ int get_indxg2p(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return (isrcproc + indxglob / nb) % nprocs; +} +__device__ int get_indxg2l(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__device__ int get_index_g2l_r(const int& gindx, const int& m, const int& mb,const int &myprow, const int& irsrc, const int& nprows) +{ + return myprow != get_indxg2p(gindx, mb, myprow, irsrc, nprows) || + gindx >= m + ? -1 + : get_indxg2l(gindx, mb, myprow, irsrc, nprows); +} +__device__ int get_index_g2l_c(const int& gindx, const int &n, const int &nb, const int &mypcol, const int &icsrc, const int &npcols) +{ + return mypcol != get_indxg2p(gindx, nb, mypcol, icsrc, npcols) || + gindx >= n + ? 
-1 + : get_indxg2l(gindx, nb, mypcol, icsrc, npcols); +} +__global__ void det_multiply_seq(const cuDoubleComplex *inC,const int* d_ipiv,const int& num, cuDoubleComplex* ouC){ + int f = blockIdx.x*blockDim.x + threadIdx.x; + // printf("f = %d\n", f); + ouC[f]=make_cuDoubleComplex(1.0, 0.0); + for(int i=f;i>>(d_a,d_ipiv,*d_n,d_ouC); + cudaMemcpyAsync(h_ouC, d_ouC, blockSize*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_ouC, stream); + cudaFreeAsync(d_n, stream); + for(int i=0;i>>(d_a,*d_n); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_n, stream); +} +void CudaConnector::multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream) +{ + int blockSize = 256; + cuDoubleComplex * d_num; + int *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int), stream)); + int len = mat.nr()*mat.nc(); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuDoubleComplex), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream)); + int gridSize = (mat.nr()*mat.nc() + blockSize - 1) / blockSize; + multiply_number_for_ComplexMatrixDevice_kernel<<>>(mat.ptr(), *d_num, *d_len); +} +__global__ void diag_add_ComplexMatrixDevice_kernel(cuDoubleComplex* d_a, const double& d_num, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < d_m) + { + int col = row; // Diagonal element + int local_row = get_index_g2l_r(row, d_m, d_mb, d_myprow, d_irsrc, d_nprows); + int local_col = get_index_g2l_c(col, d_n, d_nb, d_mypcol, d_icsrc, d_npcols); + if (local_row != -1 && local_col != -1 && local_row < d_m_loc && local_col < d_n_loc) + { + int local_index = local_row + local_col * d_m_loc; // Column-major order + d_a[local_index].x += d_num; // Add the real number to the real part + } + } +} + +void CudaConnector::diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream) +{ + double *d_num; + int blockSize = 256; + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(double), cudaMemcpyHostToDevice, stream)); + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), 
cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int gridSize = (arrdesc.m() + blockSize - 1) / blockSize; + diag_add_ComplexMatrixDevice_kernel<<>>(mat.ptr(),*d_num,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_num, stream)); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + +} +__global__ void diag_add_matrix_device_blacs_kernel(const cuDoubleComplex* num, cuDoubleComplex* d_A, const Array_Desc_Device& array_desc_device) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if(i>=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + if(jlo==-1) + return; + if(ilo>>((cuDoubleComplex*)d_num,(cuDoubleComplex*)d_A,*d_array_desc_device); + + }else if(compute_type == LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuFloatComplex), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(cuFloatComplex), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else if(compute_type == LIBRPA_COMPUTE_TYPE_DOUBLE){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(double), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(float), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + } + CUDA_CHECK(cudaFreeAsync(d_num, (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, (cudaStream_t)device_stream.stream)); + +} +__global__ void det_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in,cuDoubleComplex* d_out,const int& m,const int& n,const int& m_loc,const int& 
n_loc,const int& mb,const int& nb,const int& myprow,const int& mypcol,const int& irsrc,const int& icsrc,const int& nprows,const int& npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // printf("idx=%d\n",idx); + d_out[idx] = make_cuDoubleComplex(1.0,0.0); + + for(int i=idx;i CudaConnector::det_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice&d_A, const LIBRPA::Array_Desc &arrdesc_pi) +{ + std::complex det_loc={1.0,0.0}; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc_pi.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc_pi.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc_pi.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc_pi.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc_pi.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc_pi.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc_pi.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc_pi.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc_pi.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc_pi.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc_pi.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc_pi.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int blockSize = 256; + int gridSize = 1; + cuDoubleComplex* d_detA; + std::complex *h_detA = new std::complex[blockSize]; + CUDA_CHECK(cudaMallocAsync((void**)&d_detA, sizeof(cuDoubleComplex) * blockSize, stream)); + // printf("before det_ComplexMatrixDevice_blacs_kernel\n"); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + det_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_detA,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + 
CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + // printf("after det_ComplexMatrixDevice_blacs_kernel\n"); + CUDA_CHECK(cudaMemcpyAsync(h_detA, d_detA, sizeof(cuDoubleComplex) * blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_detA, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i>>(d_in, d_out, *d_m_ptr, *d_n_ptr, *d_conjugate_ptr); + // printf("after transpose kernel\n"); + CUDA_CHECK(cudaFreeAsync(d_m_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_n_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_conjugate_ptr, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + in.set_ptr(d_out); + in.set_nr(n); + in.set_nc(m); +} +__global__ void cuFloatComplex_to_cuDoubleComplex_kernel(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_out[idx] = make_cuDoubleComplex(static_cast(d_in[idx].x), static_cast(d_in[idx].y)); + } +} +void CudaConnector::cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& len, const cudaStream_t& stream) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + // printf("gridSize = %d, blockSize = %d, len = %d\n", gridSize, blockSize, len); + // Launch the kernel + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, stream)); + cuFloatComplex_to_cuDoubleComplex_kernel<<>>(d_in, d_out, *d_len); +} +__global__ void cuDoubleComplex_to_cuFloatComplex_kernel(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& d_len) +{ + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < d_len) + { + d_b[idx] = make_cuFloatComplex(static_cast(d_a[idx].x), static_cast(d_a[idx].y)); + } +} +void CudaConnector::cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + // printf("gridSize = %d, blockSize = %d, len = %d\n", gridSize, blockSize, len); + // Launch the kernel + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, stream)); + // printf("start kernel function\n"); + cuDoubleComplex_to_cuFloatComplex_kernel<<>>(d_a, d_b, *d_len); +} +__global__ void float_to_double_kernel(const float* d_a, double* d_b, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_b[idx] = d_a[idx]; + } +} +void CudaConnector::float_to_double_device(const float* d_a, double* d_b, const int64_t& len) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream)); + float_to_double_kernel<<>>(d_a, d_b, *d_len); + CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void double_to_float_kernel(const double* d_a, float* d_b, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_b[idx] = static_cast(d_a[idx]); + } +} +void 
CudaConnector::double_to_float_device(const double* d_a, float* d_b, const int64_t& len) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream)); + double_to_float_kernel<<>>(d_a, d_b, *d_len); + CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void trace_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in, cuDoubleComplex* d_out, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + d_out[idx] = make_cuDoubleComplex(0.0, 0.0); + for(int i=idx;i *trace_z = (std::complex*)trace; + CUDA_CHECK(cudaMallocAsync((void**)&d_trace, blockSize*sizeof(std::complex), device_stream.stream)); + h_trace = (std::complex*)malloc(blockSize*sizeof(std::complex)); + std::complex *h_trace_z = (std::complex*)h_trace; + *(trace_z) = std::complex(0.0, 0.0); + trace_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_trace, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_trace, d_trace, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i *det_z = (std::complex*)det; + CUDA_CHECK(cudaMallocAsync((void**)&d_det, blockSize*sizeof(std::complex), device_stream.stream)); + h_det = (std::complex*)malloc(blockSize*sizeof(std::complex)); + std::complex *h_det_z = (std::complex*)h_det; + *(det_z) = std::complex(1.0, 0.0); + det_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_det, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_det, d_det, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i CudaConnector::trace_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice& d_A, const LIBRPA::Array_Desc &arrdesc) +{ + + int blockSize = 256; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), 
cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + cuDoubleComplex *d_out; + CUDA_CHECK(cudaMallocAsync((void**)&d_out, sizeof(cuDoubleComplex)*blockSize, stream)); + std::complex *h_out; + std::complex trace_loc = {0.0, 0.0}; + h_out = new std::complex[blockSize]; + int gridSize = 1; + trace_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_out,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_out, d_out, sizeof(cuDoubleComplex)*blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_out, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + if(jlo==-1) + return; + d_A[ilo+jlo*array_desc_device.m_loc()] = make_cuDoubleComplex(1.0, 0.0); +} +void ComplexMatrixDevice::set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device& array_desc_device){ + this->set_as_zero(gpu_dev_stream.stream); + Array_Desc_Device* d_array_desc_device; + // printf("array_desc_device.m():%d\n", array_desc_device.m()); + CUDA_CHECK(cudaMallocAsync((void**)&d_array_desc_device, sizeof(Array_Desc_Device), gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_array_desc_device, &array_desc_device, sizeof(Array_Desc_Device), cudaMemcpyHostToDevice, gpu_dev_stream.stream)); + if(array_desc_device.m()!=array_desc_device.n()){ + fprintf(stderr,"Error: ComplexMatrixDevice::set_as_identity: m and n are not equal\n"); + std::abort(); + } + int blockSize = 256; + int gridSize = (array_desc_device.m()+blockSize-1)/blockSize; + set_as_identity_kernel<<>>(d_data, *d_array_desc_device); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, gpu_dev_stream.stream)); +} \ No newline at end of file diff --git a/src/cuda_connector.h b/src/cuda_connector.h new file mode 100644 index 00000000..f496bf08 --- /dev/null +++ b/src/cuda_connector.h @@ -0,0 +1,621 @@ +#ifndef CUDA_CONNECTOR_H +#define CUDA_CONNECTOR_H + +#pragma once +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include 
+//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include "base_blacs.h" +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#include "scalapack_connector.h" +#include +#include +#include "gpu_device_stream.h" +#endif +using LIBRPA::Array_Desc; +#include "array_desc_device.h" +#include "matrix_device.h" +#ifdef ENABLE_NVHPC + + +class ComplexMatrixDevice{ +private: + int m=0; + int n=0; +public: + cuDoubleComplex* d_data=nullptr; + + cusolverMpGrid_t grid_cusolver=nullptr; + cusolverMpMatrixDescriptor_t desc_cusolver=nullptr; + + cublasMpGrid_t grid_cublas=nullptr; + cublasMpMatrixDescriptor_t desc_cublas=nullptr; + + bool is_cusolver_init=false; + bool is_cublas_init=false; + + ComplexMatrixDevice(){ + + } + ComplexMatrixDevice(const int& m, const int& n){ + ComplexMatrixDevice(); + this->m=m; + this->n=n; + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex))); + } + ComplexMatrixDevice(const int& m, const int& n,void* c_data){ + this->m=m; + this->n=n; + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex))); + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); + } + void set_data(const int& m, const int& n, const cudaStream_t& stream=nullptr){ + if(m!=this->m || n!=this->n){ + clean(stream); + this->m=m; + this->n=n; + if(stream==nullptr){ + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex))); + }else{ + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(cuDoubleComplex),stream)); + } + } + } + void set_data(const int& m, const int& n,const void* c_data, const cudaStream_t& stream=nullptr){ + set_data(m,n,stream); + if(stream==nullptr){ + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); + }else{ + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice,stream)); + } + } + void set_data_device(const int& m, const int& n,const void* d_A, const GpuDeviceStream& gpu_dev_stream) + { + if(m!=this->m || n!=this->n){ + set_data(m,n,gpu_dev_stream.stream); + } + + CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data,gpu_dev_stream.local_device, d_A, gpu_dev_stream.local_device, m * n * sizeof(cuDoubleComplex), gpu_dev_stream.stream)); + + } + void cusolver_init(cusolverMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){ + if(!is_cusolver_init){ + CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid(handle,&grid_cusolver,cal_comm,arrdesc.nprows(),arrdesc.npcols(),CUSOLVERMP_GRID_MAPPING_ROW_MAJOR)); + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc(&desc_cusolver,grid_cusolver,CUDA_C_64F,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc())); + is_cusolver_init=true; + } + } + void cublas_init(cublasMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){ + if(!is_cublas_init){ + CUBLASMP_CHECK(cublasMpGridCreate(handle,arrdesc.nprows(),arrdesc.npcols(),CUBLASMP_GRID_LAYOUT_ROW_MAJOR,cal_comm,&grid_cublas)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorCreate(handle,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc(),CUDA_C_64F,grid_cublas,&desc_cublas)); + is_cublas_init=true; + } + } + cuDoubleComplex* ptr(){ + return d_data; + } + const cuDoubleComplex* ptr() const { + return d_data; + } + int nr(){ + return this->m; + } + int nc(){ + return this->n; + } + void set_nr(int m){ + this->m=m; + } + void set_nc(int n){ + this->n=n; + } + void set_ptr(cuDoubleComplex* d_data){ + if(this->d_data!=nullptr){ + 
cudaFree(this->d_data); + this->d_data=nullptr; + } + this->d_data=d_data; + } + void clean(const cudaStream_t& stream=nullptr){ + if(d_data!=nullptr){ + if(stream==nullptr) + cudaFree(d_data); + else + cudaFreeAsync(d_data,stream); + d_data=nullptr; + } + this->m=0; + this->n=0; + } + void set_as_zero(const cudaStream_t& stream=nullptr); + void set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device&); + void cublasClean(cublasMpHandle_t handle){ + if(desc_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,desc_cublas)); + desc_cublas=nullptr; + } + if(grid_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid_cublas)); + grid_cublas=nullptr; + } + is_cublas_init=false; + } + ~ComplexMatrixDevice(){ + if(is_cublas_init){ + fprintf(stderr, "Error: ComplexMatrixDevice not cleaned cublasMp resources before destructing!\n"); + std::abort(); + } + // CUDA_CHECK(cudaDeviceSynchronize()); + if(d_data!=nullptr){ + CUDA_CHECK(cudaFree(d_data)); + d_data=nullptr; + } + if(desc_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyMatrixDesc(desc_cusolver)); + desc_cusolver=nullptr; + } + if(grid_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyGrid(grid_cusolver)); + grid_cusolver=nullptr; + } + } + +}; + + +#endif + +class CudaConnector +{ +public: + static + void write_file(double* A,int M,int N,const char* name){ + std::fstream out; + out.open(name, std::ios::out); + if (!out.is_open()) { + std::cerr << "Failed to open file: " << name << std::endl; + return; + } + for(int i=0;i *a, cuDoubleComplex *b, const int n, const int lda,bool is_transpose=false) + { + if (is_transpose) + { + #pragma omp parallel for + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + b[i*lda+j].x = a[j*n+i].real(); + b[i*lda+j].y = a[j*n+i].imag(); + } + } + } + else + { + #pragma omp parallel for + for (int i = 0; i < n*lda; i++) + { + b[i].x = a[i].real(); + b[i].y = a[i].imag(); + } + } + } + static inline + void cuDoubleComplexToDoubleComplex_host(std::complex *a,const cuDoubleComplex *b,const int n,const int lda,bool is_transpose=false) + { + if (is_transpose) + { + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + a[j*n+i].real(b[i*lda+j].x); + a[j*n+i].imag(b[i*lda+j].y); + } + } + } + else + { + for (int i = 0; i < n*lda; i++) + { + a[i].real(b[i].x); + a[i].imag(b[i].y); + } + } + } + // update by chenhaobo in 2025-05-19 + static inline + void cuZgetrf_f(const int& m, const int& n, cuDoubleComplex *d_a, const int& lda, int *d_ipiv, int *d_info, cusolverDnHandle_t cusolverH) + { + cudaStream_t stream; + cusolverStatus_t status = cusolverDnGetStream(cusolverH, &stream); + // printf("cusolverDnGetStream status:%d\n",status); + // printf("Stream stream:%d\n",stream); + // printf("cusolverDnhandle:%d\n",cusolverH); + int lwork; + // auto start = std::chrono::high_resolution_clock::now(); + cusolverDnZgetrf_bufferSize(cusolverH, m, n, d_a, lda, &lwork); + cuDoubleComplex *d_work; + cudaMallocAsync((void**)&d_work, lwork * sizeof(cuDoubleComplex), stream); + // cudaStreamSynchronize(stream); + cusolverDnZgetrf(cusolverH, m, n, d_a, lda, d_work, d_ipiv, d_info); + // cudaStreamSynchronize(stream); + // auto end = std::chrono::high_resolution_clock::now(); + // auto duration= std::chrono::duration_cast(end - start); + // int info; + // cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost); + // printf("stream:%d, info:%d,Time taken of the cudaStreamSynchronize on GPU: %lld microseconds\n",stream,info, 
duration.count()); + cudaFreeAsync(d_work, stream); + } + static + cuDoubleComplex det_cuDoubleComplex(const cuDoubleComplex *d_a,const int *d_ipiv, const int& n, const cudaStream_t stream); + static + void cuDoubleComplex_minus_identity_Async(cuDoubleComplex *d_a,const int& h_n, const cudaStream_t stream); + static inline void cuDoubleComplex_minus_identity_host(cuDoubleComplex* h_a, const int& n) + { + #pragma omp parallel for + for(int i=0;i *, const int &, + // const int &, const LIBRPA::Array_Desc &, int *, int &,const char order='C'); + static + void pzgetrf_nvhpc(const GpuDeviceStream&, ComplexMatrixDevice &, const int &, + const int &, const LIBRPA::Array_Desc &, int64_t *, int *,const char& order='C'); + static + void pgetrf_nvhpc_mixed_precision( + const GpuDeviceStream&, void *, + const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrf_nvhpc_mixed_precision( + void *, const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + const void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &, + int* d_info,const cudaDataType_t& compute_type, + const char& order='C' + ); + static void pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &, + const cudaDataType_t& compute_type, const char& order='C' + ); + static + std::complex det_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + std::complex trace_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + void trace_matrix_device_blacs(void* trace, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + static + void det_matrix_device_blacs(void* det, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + + // static + // void pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, + // const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, + // const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, + // const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C); + + static + void pgemm_device(cublasMpHandle_t,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t, + const ComplexMatrixDevice &,int64_t,int64_t, + const void *, + ComplexMatrixDevice &,int64_t,int64_t, + cublasComputeType_t); + + static + void pgemm_nvhpc(const GpuDeviceStream&,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + const ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + const void *, + ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + cublasComputeType_t); + // static + // void pgemm_nvhpc_cuFloatComplex(const 
GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + // const void *alpha, + // const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + // const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + // const void *beta, + // cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + // cublasComputeType_t cublas_compute_type); + static + void pgemm_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t&, + const char& order = 'c' + ); + static + void pgemr2d_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const int&,const int&, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& + ); + static + void cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_a, cuDoubleComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void float_to_double_device(const float* d_a, double* d_b, const int64_t& len); + static + void double_to_float_device(const double* d_a, float* d_b, const int64_t& len); + static + void multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream); + static + void diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream); + static + void diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc,const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static + void transpose_ComplexMatrixDevice(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& in ,bool is_conjugate=false); + + #endif + +}; + +#endif // CUDA_CONNECTOR_H \ No newline at end of file diff --git a/src/device_connector.cpp b/src/device_connector.cpp new file mode 100644 index 00000000..b343450c --- /dev/null +++ b/src/device_connector.cpp @@ -0,0 +1,248 @@ +#include "device_connector.h" +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif +#include "device_stream.h" + +void DeviceConnector::pgetrf_device_mixed_precision( + void* d_A, const 
int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order +){ + { + #ifdef ENABLE_NVHPC + cudaDataType_t cuda_compute_type; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuda_compute_type = CUDA_C_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuda_compute_type = CUDA_C_32F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cuda_compute_type = CUDA_R_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cuda_compute_type = CUDA_R_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgetrf_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + d_A, m, n, + arrdesc_pi, + d_ipiv, d_info, + cuda_compute_type, + order + ); + #else + fprintf(stderr, "Error: NVHPC is not enabled in this build, cannot call pgetrf_device_mixed_precision\n"); + exit(1); + #endif + } +} + +void DeviceConnector::pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + cublasComputeType_t cublas_compute_type; + cublasOperation_t transA, transB; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transa=='N'||transa=='n'){ + transA = CUBLAS_OP_N; + }else if(transa=='T'||transa=='t'){ + transA = CUBLAS_OP_T; + }else if(transa=='C'||transa=='c'){ + transA = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error: Unsupported transa in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transb=='N'||transb=='n'){ + transB = CUBLAS_OP_N; + }else if(transb=='T'||transb=='t'){ + transB = CUBLAS_OP_T; + }else if(transb=='C'||transb=='c'){ + transB = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error: Unsupported transb in pgemm_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgemm_nvhpc_mixed_precision( + transA, transB, + m, n, k, + alpha, + d_A, ia, ja, array_descA, + d_B, ib, jb, array_descB, + beta, + d_C, ic, jc, array_descC, + cublas_compute_type + ); + #endif +} + +void DeviceConnector::transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate +){ + #ifdef ENABLE_NVHPC + cublasHandle_t cublasH = NULL; + CUBLAS_CHECK(cublasCreate(&cublasH)); + CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream)); + cublasOperation_t trans = CUBLAS_OP_T; + if(is_conjugate) + trans = CUBLAS_OP_C; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); + CUBLAS_CHECK(cublasZgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuDoubleComplex*)d_A, 
n, + &beta, + (cuDoubleComplex*)d_B, m, + (cuDoubleComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); + cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); + CUBLAS_CHECK(cublasCgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuFloatComplex*)d_A, n, + &beta, + (cuFloatComplex*)d_B, m, + (cuFloatComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + double alpha = 1.0; + double beta = 0.0; + CUBLAS_CHECK(cublasDgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (double*)d_A, n, + &beta, + (double*)d_B, m, + (double*)d_B, m)); + + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + float alpha = 1.0; + float beta = 0.0; + CUBLAS_CHECK(cublasSgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (float*)d_A, n, + &beta, + (float*)d_B, m, + (float*)d_B, m)); + }else{ + fprintf(stderr, "Error: Unsupported compute type in transpose_device_blas\n"); + exit(1); + } + CUBLAS_CHECK(cublasDestroy(cublasH)); + #endif +} + +void DeviceConnector::num_multiply_matrix_device( + const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + cublasHandle_t cublasH = NULL; + CUBLAS_CHECK(cublasCreate(&cublasH)); + CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream)); + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + CUBLAS_CHECK(cublasZscal( + cublasH, n, + (cuDoubleComplex*)num, + (cuDoubleComplex*)d_A, 1 + )); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + CUBLAS_CHECK(cublasCscal( + cublasH, n, + (cuFloatComplex*)num, + (cuFloatComplex*)d_A, 1 + )); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + CUBLAS_CHECK(cublasDscal( + cublasH, n, + (double*)num, + (double*)d_A, 1 + )); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + CUBLAS_CHECK(cublasSscal( + cublasH, n, + (float*)num, + (float*)d_A, 1 + )); + }else{ + fprintf(stderr, "Error: Unsupported compute type in num_multiply_matrix_device\n"); + exit(1); + } + #endif +} + +void DeviceConnector::diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::diag_add_matrix_device_blacs( + num, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::trace_matrix_device_blacs( + void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::trace_matrix_device_blacs( + trace, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::det_matrix_device_blacs( + det, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::float_to_double_device(float* d_A, double* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::float_to_double_device(d_A, d_B, n); + #endif +} + +void DeviceConnector::double_to_float_device(double* d_A, float* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::double_to_float_device(d_A, d_B, n); + #endif +} diff --git a/src/device_connector.h b/src/device_connector.h new file mode 100644 index 00000000..b4188f68 --- /dev/null +++ b/src/device_connector.h @@ -0,0 +1,54 @@ +#ifndef DEVICE_CONNECTOR_H +#define DEVICE_CONNECTOR_H +#include "base_blacs.h" +#include 
"helpers.h" +using LIBRPA::Array_Desc; + +class DeviceConnector{ +public: + static void pgetrf_device_mixed_precision( + void* d_A, const int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order='C' + ); + static void pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate=false + ); + static void num_multiply_matrix_device( + const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + + static void trace_matrix_device_blacs( + void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void float_to_double_device(float* d_A, double* d_B, const int64_t& n); + static void double_to_float_device(double* d_A, float* d_B, const int64_t& n); + + +}; + + + +#endif // DEVICE_CONNECTOR_H \ No newline at end of file diff --git a/src/device_stream.cpp b/src/device_stream.cpp new file mode 100644 index 00000000..03b9ddfc --- /dev/null +++ b/src/device_stream.cpp @@ -0,0 +1,34 @@ +#include "device_stream.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +void DeviceStream::init(){ + local_device = DeviceStream::getLocalDevice(); + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = DeviceStream::allgather; + params.req_test = DeviceStream::request_test; + params.req_free = DeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = mpi_comm_global_h.myid; + params.nranks = mpi_comm_global_h.nprocs; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate((cudaStream_t*)&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolverMp_handle, local_device, (cudaStream_t)stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublasMp_handle, (cudaStream_t)stream)); + #endif +} +void DeviceStream::check_memory(){ + #ifdef ENABLE_NVHPC + size_t free, total; + CUDA_CHECK(cudaMemGetInfo(&free, &total)); // 直接查询驱动 + printf("rank:%d, Used: %f GiB / %f GiB\n",mpi_comm_global_h.myid, (total - free) / (1024.0*1024.0*1024.0), total / (1024.0*1024.0*1024.0)); + #endif +} + + +DeviceStream device_stream = DeviceStream(); \ No newline at end of file diff --git a/src/device_stream.h b/src/device_stream.h new file mode 100644 index 00000000..91b9800c --- /dev/null +++ b/src/device_stream.h @@ -0,0 +1,134 @@ +#ifndef DEVICE_STREAM_H +#define DEVICE_STREAM_H + +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#include +#include +#endif +#include "helpers.h" +#include +class DeviceStream{ +private: + #ifdef ENABLE_NVHPC + 
cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + #endif + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + #endif + + return localRank % deviceCount; + } +public: + int local_device; + #ifdef ENABLE_NVHPC + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + cusolverMpHandle_t cusolverMp_handle = nullptr; + cublasMpHandle_t cublasMp_handle = nullptr; + #endif + DeviceStream(){} + void check_memory(); + void init(); + void finalize(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + ~DeviceStream(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + void sync() const{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + #endif + } + #ifdef ENABLE_NVHPC + void cudaSync() const{ + CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,(cudaStream_t)stream)); + } + #endif +}; + +extern DeviceStream device_stream; +#endif // DEVICE_STREAM_H \ No newline at end of file diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index b634dc33..8338c640 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -20,6 +20,9 @@ using LIBRPA::envs::mpi_comm_global_h; using LIBRPA::envs::ofs_myid; using LIBRPA::utils::lib_printf; +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif const int DoubleHavriliakNegami::d_npar = 8; const std::function &)> @@ -498,18 +501,20 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac int n_lambda = this->n_nonsingular - 1; Array_Desc desc_wing_mu(blacs_ctxt_global_h); desc_wing_mu.init_square_blk(n_abf, 3, 0, 0); + desc_wing_mu.init(n_abf, 3, desc_nabf_nabf_opt.mb(), desc_wing_mu.nb(), 0, 0); 
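+    // Re-initializing the descriptor so that its row blocksize matches
+    // desc_nabf_nabf_opt.mb() keeps the row distribution of wing_mu aligned with that
+    // of sqrtveig_blacs, which lives on desc_nabf_nabf_opt; presumably this alignment
+    // is what the pgemm_f contraction over the n_abf dimension further down requires.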
Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); - // opt descriptor for wing - Array_Desc desc_wing_opt(blacs_ctxt_global_h); - desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + int mb = std::min(128, desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); + + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); for (int iomega = 0; iomega != this->omega.size(); iomega++) { auto &wing_tmp = this->wing.at(iomega); - wing_tmp = init_local_mat>(desc_wing_opt, MAJOR::COL); + wing_tmp = init_local_mat>(desc_wing, MAJOR::COL); // TODO: reconstruct wing_mu auto wing_mu_tmp = init_local_mat>(desc_wing_mu, MAJOR::COL); for (int alpha = 0; alpha != 3; alpha++) @@ -529,7 +534,7 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac ScalapackConnector::pgemm_f('C', 'N', n_lambda, 3, n_abf, 1.0, sqrtveig_blacs.ptr(), 1, 2, desc_nabf_nabf_opt.desc, wing_mu_tmp.ptr(), 1, 1, desc_wing_mu.desc, 0.0, wing_tmp.ptr(), 1, 1, - desc_wing_opt.desc); + desc_wing.desc); } this->wing_mu.clear(); @@ -1058,6 +1063,8 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); this->body_inv = init_local_mat>(desc_body, MAJOR::COL); /* for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) @@ -1077,6 +1084,46 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, return desc_body; }; +#ifdef ENABLE_NVHPC +Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) +{ + Profiler::start("get_inverse_body_of_chi0_nvhpc"); + gpu_dev_stream.calSync(); + Array_Desc desc_body(blacs_ctxt_global_h); + desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); + this->d_body_inv.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + Array_Desc_Device desc_body_dev(desc_body); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_nonsingular - 1, n_nonsingular - 1, + d_chi0_block.ptr(), 2, 2, desc_nabf_nabf_opt, + this->d_body_inv.ptr(), 1, 1, desc_body, + CUDA_C_64F + ); + ComplexMatrixDevice d_identity; + d_identity.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + d_identity.set_as_identity(gpu_dev_stream, desc_body_dev); + char order = 'c'; + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_body_inv); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_body_inv.ptr(), 1, 1, desc_body, + d_identity.ptr(), 1, 1, desc_body, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + + this->d_body_inv.set_data_device(d_identity.nr(),d_identity.nc(),d_identity.ptr(),gpu_dev_stream); + Profiler::stop("get_inverse_body_of_chi0_nvhpc"); + return desc_body; +} +#endif void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) { 
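+    // Rough picture of construct_L (and of construct_L_nvhpc below): with B the body
+    // block of the dielectric matrix in the Coulomb-eigenvector basis and w its wing,
+    // it assembles the 3x3 head correction
+    //     Lind(ifreq) = head(ifreq) - w^H * B^{-1} * w
+    // together with bw = B^{-1} * w and wb = w^H * B^{-1}, which cal_eps /
+    // cal_eps_nvhpc then use in the angular average around Gamma.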
Profiler::start("cal_L"); @@ -1086,17 +1133,16 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); // opt descriptor for wing - Array_Desc desc_wing_opt(blacs_ctxt_global_h); - desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); - + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + Array_Desc desc_lam_3(blacs_ctxt_global_h); - desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); Array_Desc desc_3_lam(blacs_ctxt_global_h); - desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); Array_Desc desc_3_3(blacs_ctxt_global_h); - desc_3_3.init_square_blk(3, 3, 0, 0); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); @@ -1104,12 +1150,12 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); + desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, + desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, Lind_loc.ptr(), 1, 1, desc_3_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, - wing.at(ifreq).ptr(), 1, 1, desc_wing_opt.desc, body_inv.ptr(), 1, + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); for (int i = 0; i != 3; i++) @@ -1146,6 +1192,114 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) Profiler::stop("cal_L"); }; +#ifdef ENABLE_NVHPC +void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body) +{ + Profiler::start("cal_L"); + this->Lind.resize(3, 3, MAJOR::COL); + this->bw.resize(n_nonsingular - 1, 3, MAJOR::COL); + this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); + Array_Desc desc_wing(blacs_ctxt_global_h); + desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); + // opt descriptor for wing + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + + Array_Desc desc_lam_3(blacs_ctxt_global_h); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + + Array_Desc desc_3_lam(blacs_ctxt_global_h); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); + + Array_Desc desc_3_3(blacs_ctxt_global_h); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); + + auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); + auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); + auto Lind_loc = init_local_mat>(desc_3_3, MAJOR::COL); + // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); + + ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc,d_wing_ifreq; + + d_wing_ifreq.set_data(wing.at(ifreq).nr(),wing.at(ifreq).nc(),wing.at(ifreq).ptr(),gpu_dev_stream.stream); + + 
d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); + d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); + d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); + + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, + &calpha, + d_body_inv, 1, 1, desc_body, + d_wing_ifreq, 1, 1, desc_wing, + &cbeta, + d_lam_3, 1, 1, desc_lam_3, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); + // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); + // printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, 3, n_nonsingular - 1, + &calpha, + d_wing_ifreq, 1, 1, desc_wing, + d_lam_3, 1, 1, desc_lam_3, + &cbeta, + d_Lind_loc, 1, 1, desc_3_3, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, + &calpha, + d_wing_ifreq, 1, 1, desc_wing, + d_body_inv, 1, 1, desc_body, + &cbeta, + d_3_lam, 1, 1, desc_3_lam, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + + CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + for (int i = 0; i != 3; i++) + { + auto loc_i = desc_3_3.indx_g2l_r(i); + for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) + { + auto loc_ilambda = desc_lam_3.indx_g2l_r(ilambda); + auto loc_ibw = desc_lam_3.indx_g2l_c(i); + if (loc_ibw >= 0 && loc_ilambda >= 0) + this->bw(ilambda, i) = lam_3(loc_ilambda, loc_ibw); + + loc_ilambda = desc_3_lam.indx_g2l_c(ilambda); + auto loc_iwb = desc_3_lam.indx_g2l_r(i); + if (loc_iwb >= 0 && loc_ilambda >= 0) + this->wb(i, ilambda) = _3_lam(loc_iwb, loc_ilambda); + + MPI_Allreduce(MPI_IN_PLACE, &bw(ilambda, i), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + MPI_Allreduce(MPI_IN_PLACE, &wb(i, ilambda), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + + for (int j = 0; j != 3; j++) + { + auto loc_j = desc_3_3.indx_g2l_c(j); + if (loc_j >= 0 && loc_i >= 0) + this->Lind(i, j) = head.at(ifreq)(i, j) - Lind_loc(loc_i, loc_j); + MPI_Allreduce(MPI_IN_PLACE, &Lind(i, j), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + } + + Profiler::stop("cal_L"); +}; +#endif + void diele_func::get_Leb_points() { auto quad_order = lebedev::QuadratureOrder::order_5810; @@ -1306,25 +1460,7 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - // auto identity = init_local_mat>(desc_body, MAJOR::COL); - // for (int i = 0; i < n_nonsingular - 1; i++) - // { - // const int ilo = desc_body.indx_g2l_r(i); - // if (ilo < 0) continue; - // for (int j 
= 0; j < n_nonsingular - 1; j++) - // { - // const int jlo = desc_body.indx_g2l_c(j); - // if (jlo < 0) continue; - // if (i == j) - // identity(ilo, jlo) = 1.0; - // else - // identity(ilo, jlo) = 0.0; - // } - // } - // ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - - // 1, - // 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, - // desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgeadd_f('N', n_nonsingular - 1, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, 1, desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); Profiler::stop("cal_inverse_dielectric_matrix_ij"); @@ -1334,6 +1470,119 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ Profiler::stop("cal_inverse_dielectric_matrix"); }; +#ifdef ENABLE_NVHPC +void diele_func::cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body) +{ + Profiler::start("cal_inverse_dielectric_matrix"); + // mpi_comm_global_h.barrier(); + this->chi0 = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + + const double k_volume = std::abs(G.Det()); + this->vol_gamma = k_volume / nk; + double vol_gamma_numeric = 0.0; + if (ifreq == 0 && mpi_comm_global_h.is_root()) + { + for (int ileb = 0; ileb != qw_leb.size(); ileb++) + { + vol_gamma_numeric += qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / 3.0; + } + std::cout << "Number of angular grids for average inverse dielectric matrix: " + << qw_leb.size() << std::endl; + std::cout << "vol_gamma_numeric/vol_gamma: " << vol_gamma_numeric << ", " << vol_gamma + << std::endl; + std::cout << "Angular quadrature accuracy for volume: " << vol_gamma_numeric / vol_gamma + << " (should be close to 1)" << std::endl; + } + /*std::cout << "major of Matz: " << wing[0].is_row_major() << "," << body_inv.is_row_major() + << "," << transpose(wing.at(0), true).is_row_major() << "," << Lind.is_row_major() + << std::endl;*/ + construct_L_nvhpc(gpu_dev_stream, ifreq, desc_body); + + Profiler::start("precompute_q_data"); + + const size_t nleb = qw_leb.size(); + std::vector> weights(nleb); + std::vector> q_vectors(nleb); + + const auto L00 = Lind(0, 0), L01 = Lind(0, 1), L02 = Lind(0, 2); + const auto L10 = Lind(1, 0), L11 = Lind(1, 1), L12 = Lind(1, 2); + const auto L20 = Lind(2, 0), L21 = Lind(2, 1), L22 = Lind(2, 2); + +#pragma omp parallel for schedule(static) + for (int ileb = 0; ileb < nleb; ++ileb) + { + const double qx = qx_leb[ileb]; + const double qy = qy_leb[ileb]; + const double qz = qz_leb[ileb]; + + q_vectors[ileb] = {qx, qy, qz}; + + const auto qLq = qx * (qx * L00 + qy * L01 + qz * L02) + + qy * (qx * L10 + qy * L11 + qz * L12) + + qz * (qx * L20 + qy * L21 + qz * L22); + + weights[ileb] = qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / (3.0 * vol_gamma) / qLq; + } + Profiler::stop("precompute_q_data"); + + Profiler::start("cal_inverse_dielectric_matrix_ij"); + int i_start = 0, i_end = n_nonsingular; + int j_start = 0, j_end = n_nonsingular; +#pragma omp parallel for schedule(dynamic, 4) collapse(2) + for (int i = i_start; i != i_end; i++) + { + for (int j = j_start; j != j_end; j++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + if (jlo < 0) continue; + + complex result = 0.0; + + if (i == 0 && j == 0) + { + for (int ileb = 0; ileb < nleb; ++ileb) + { + result += weights[ileb]; + } + } + else if (i == 0 || j == 0) + { + result = 0.0; + } + 
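+                // Body block (i, j >= 1): accumulate the wing contribution
+                //     sum_leb w_leb * (bw_i . q) * (q . wb_j)
+                // of the angular average; the bare B^{-1} part is added afterwards by
+                // the pgeadd_f call that follows this loop.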
else + { + const int idx_i = i - 1, idx_j = j - 1; + + const auto bw_i0 = bw(idx_i, 0), bw_i1 = bw(idx_i, 1), bw_i2 = bw(idx_i, 2); + const auto wb_j0 = wb(0, idx_j), wb_j1 = wb(1, idx_j), wb_j2 = wb(2, idx_j); + + for (int ileb = 0; ileb < nleb; ++ileb) + { + const auto &[qx, qy, qz] = q_vectors[ileb]; + const auto bwq = bw_i0 * qx + bw_i1 * qy + bw_i2 * qz; + const auto qwb = qx * wb_j0 + qy * wb_j1 + qz * wb_j2; + + result += weights[ileb] * bwq * qwb; + } + } + chi0(ilo, jlo) = result; + } + } + ScalapackConnector::pgeadd_f( + 'N', n_nonsingular - 1, n_nonsingular - 1, + 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, + 1.0, + chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + Profiler::stop("cal_inverse_dielectric_matrix_ij"); + if (mpi_comm_global_h.is_root()) + std::cout << "* Success: calculate average inverse dielectric matrix no." << ifreq + 1 + << "." << std::endl; + Profiler::stop("cal_inverse_dielectric_matrix"); +}; +#endif /*std::complex diele_func::compute_chi0_inv_00(const int ifreq) { std::complex total = 0.0; @@ -1423,4 +1672,23 @@ void diele_func::rewrite_eps(matrix_m> &chi0_block, const i this->body_inv.clear(); }; +#ifdef ENABLE_NVHPC +void diele_func::rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const int& ifreq, Array_Desc& desc_nabf_nabf_opt) +{ + auto desc_body = get_body_inv_nvhpc(gpu_dev_stream, d_chi0_block, desc_nabf_nabf_opt); + this->body_inv = init_local_mat>(desc_body, MAJOR::COL); + CUDA_CHECK(cudaMemcpyAsync(this->body_inv.ptr(),this->d_body_inv.ptr(),sizeof(cuDoubleComplex)*desc_body.m_loc()*desc_body.n_loc(),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + + cal_eps_nvhpc(gpu_dev_stream, ifreq, desc_nabf_nabf_opt, desc_body); + d_chi0_block.set_data(this->chi0.nr(),this->chi0.nc(),this->chi0.ptr(),gpu_dev_stream.stream); + // this->chi0.clear(); + ofs_myid << get_timestamp() << "success finish rewrite_eps"<Lind.clear(); + this->body_inv.clear(); + this->d_body_inv.clean(gpu_dev_stream.stream); + ofs_myid << get_timestamp() << "success clean the data"<> body_inv; // ( i:3, j:3 ) + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_body_inv; + #endif matrix_m> Lind; // ( i:n_lambda, j:3 ) matrix_m> bw; @@ -132,16 +137,26 @@ class diele_func Array_Desc get_body_inv(matrix_m> &chi0_block, Array_Desc &desc_nabf_nabf_opt); void construct_L(const int ifreq, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body); + #endif // Lebedev-Laikov quadrature void get_Leb_points(); void get_g_enclosing_gamma(); void calculate_q_gamma(); void cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #endif // not used now due to performance optimization // std::complex compute_chi0_inv_00(const int ifreq); // std::complex compute_chi0_inv_ij(const int ifreq, int i, int j); void rewrite_eps(matrix_m> &chi0_block, const int ifreq, Array_Desc &desc_nabf_nabf_opt); + #ifdef ENABLE_NVHPC + Array_Desc get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice&, const Array_Desc&); + void rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream,ComplexMatrixDevice&,const int& ifreq,Array_Desc& desc_nabf_nabf_opt); + #endif void assign_chi0(matrix_m> &chi0_block, Array_Desc &desc_nabf_nabf_opt); }; diff --git a/src/epsilon_cuda.cpp b/src/epsilon_cuda.cpp 
new file mode 100644 index 00000000..dc194711 --- /dev/null +++ b/src/epsilon_cuda.cpp @@ -0,0 +1,1448 @@ +#include "epsilon_cuda.h" +#include "epsilon.h" +#include "device_stream.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "atoms.h" +#include "constants.h" +#include "envs_blacs.h" +#include "envs_io.h" +#include "envs_mpi.h" +#include "lapack_connector.h" +#include "libri_utils.h" +#include "matrix_m_parallel_utils.h" +#include "parallel_mpi.h" +#include "params.h" +#include "pbc.h" +#include "profiler.h" +#include "scalapack_connector.h" +#include "stl_io_helper.h" +#include "utils_blacs.h" +#include "utils_io.h" +#include "utils_mem.h" +#include "utils_mpi_io.h" +#ifdef LIBRPA_USE_LIBRI +#include +#include +using RI::Tensor; +using RI::Communicate_Tensors_Map_Judge::comm_map2_first; +#endif +#ifdef ENABLE_NVHPC +#include "helpers.h" +#include +#include +#endif + +using LIBRPA::Array_Desc; +using LIBRPA::envs::blacs_ctxt_global_h; +using LIBRPA::envs::mpi_comm_global_h; +using LIBRPA::envs::ofs_myid; +using LIBRPA::utils::lib_printf; +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + lib_printf("Begin to compute_RPA_correlation_blacs_2d_nvhpc myid: %d\n", mpi_comm_global_h.myid); + system("free -m"); + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK_nvhpc 2D\n"); + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + if (mpi_comm_global_h.myid == 0) lib_printf("n_abf = %d\n", n_abf); + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + MatrixDevice> d_chi0_block, d_coul_block, d_coul_chi0_block; + #endif + vector> qpts; + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif +#ifdef LIBRPA_USE_LIBRI + + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + } + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double block_begin = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>>> + chi0_libri; + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); + } + } + if (mpi_comm_global_h.is_root()) + { + lib_printf("Begin to clean chi0 !!! 
\n"); + LIBRPA::utils::display_free_mem(); + lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", + chi0_wq.size(), freq, q.x, q.y, q.z); + } + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + + LIBRPA::utils::release_free_mem(); + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + } + + double pi_begin = omp_get_wtime(); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr(),device_stream.stream); + d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr(),device_stream.stream); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc(), coul_chi0_block.ptr(),device_stream.stream); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + bool is_mixed_precision = false; + if(is_mixed_precision) + { + MatrixDevice> d_coul_block_f(coul_block.nr(),coul_block.nc(),device_stream.stream); + MatrixDevice> d_chi0_block_f(chi0_block.nr(),chi0_block.nc(),device_stream.stream); + MatrixDevice> d_coul_chi0_block_f(coul_chi0_block.nr(),coul_chi0_block.nc(),device_stream.stream); + DeviceConnector::double_to_float_device((double*)d_coul_block.ptr(),(float*)d_coul_block_f.ptr(),d_coul_block.nr()*d_coul_block.nc()*2); + DeviceConnector::double_to_float_device((double*)d_chi0_block.ptr(),(float*)d_chi0_block_f.ptr(),d_chi0_block.nr()*d_chi0_block.nc()*2); + DeviceConnector::double_to_float_device((double*)d_coul_chi0_block.ptr(),(float*)d_coul_chi0_block_f.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + std::complex calpha_f(1.0f,0.0f),cbeta_f(0.0f,0.0f); + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha_f, + d_coul_block_f.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + &cbeta_f, + d_coul_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT + ); + DeviceConnector::float_to_double_device((float*)d_coul_chi0_block_f.ptr(),(double*)d_coul_chi0_block.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + }else{ + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + &cbeta, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + } + double pi_end = omp_get_wtime(); + complex trace_pi(0.0, 0.0); + complex trace_pi_loc(0.0, 0.0); + DeviceConnector::trace_matrix_device_blacs(&trace_pi_loc,d_coul_chi0_block.ptr(),desc_nabf_nabf,LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + DeviceConnector::num_multiply_matrix_device(d_coul_chi0_block.nr()*d_coul_chi0_block.nc(),&calpha1,d_coul_chi0_block.ptr(),LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + DeviceConnector::diag_add_matrix_device_blacs(&calpha,d_coul_chi0_block.ptr(),desc_nabf_nabf, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + int64_t *d_ipiv; + int *d_info ; + CUDA_CHECK(cudaMallocAsync((void**)&d_ipiv, 
sizeof(int64_t)*max(desc_nabf_nabf.m_loc(), desc_nabf_nabf.n_loc()),device_stream.stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_info, sizeof(int),device_stream.stream)); + + complex ln_det = + compute_pi_det_blacs_2d_nvhpc(d_coul_chi0_block, desc_nabf_nabf, d_ipiv, d_info, 'c'); + CUDA_CHECK(cudaFreeAsync(d_ipiv,device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_info,device_stream.stream)); + + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = trace_pi + ln_det; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // <, atom_mapping::pair_t_old>> + pi_freq_q_Mu_Nu; + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); + else + pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); + lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, + pi_freq_q_Mu_Nu.size()); + // mpi_comm_global_h.barrier(); + + int range_all = N_all_mu; + + vector part_range; + part_range.resize(atom_mu.size()); + part_range[0] = 0; + int count_range = 0; + + for (int I = 0; I != atom_mu.size() - 1; I++) + { + count_range += atom_mu[I]; + part_range[I + 1] = count_range; + } + + + // pi_freq_q contains all atoms + map, ComplexMatrix>> pi_freq_q; + + for(const auto &freq : chi0.tfg.get_freq_nodes()) + { + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; + if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi = freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; + pi_freq_q[freq][q].create(range_all, range_all); + + ComplexMatrix pi_munu_tmp(range_all, range_all); + pi_munu_tmp.zero_out(); + if(!chi0.get_chi0_q().empty()) + for (const auto &Mu_Nupi : MuNupi) + { + const auto Mu = Mu_Nupi.first; + const auto Nupi = Mu_Nupi.second; + const size_t n_mu = atom_mu[Mu]; + for (const auto &Nu_pi : Nupi) + { + const auto Nu = Nu_pi.first; + const auto pimat = Nu_pi.second; + const size_t n_nu = atom_mu[Nu]; + + for (size_t mu = 0; mu != n_mu; ++mu) + { + for (size_t nu = 0; nu != n_nu; ++nu) + { + pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); + } + } + } + } + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + 
LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + { + mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); + } + else + { + pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); + } + } + } + // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + // if (mpi_comm_global_h.myid == 0) + { + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + int deviceCount; + cudaError_t err= cudaGetDeviceCount(&deviceCount); + + // if(err==cudaSuccess&&deviceCount>0&&deviceCount!=4) + // printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0); + // ==============================test the velocity of cuda stream======================================== + + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + range_all=TEST_LU_NUMBER; + complex* temp_complex=new complex[range_all*range_all]; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + double temp_begin_time=omp_get_wtime(); + // #pragma omp parallel for + for(int i=0;i[range_all*range_all]; + complex* c_ptr=pi_freq_q[freq][q].c; + + double temp_begin_time=omp_get_wtime(); + #pragma omp parallel for + for(int i=0;i trace_pi = trace(pi_freq_q.at(freq).at(q)); + complex rpa_for_omega_q(0.0, 0.0); + int info_LU = 1; + int *ipiv = new int[range_all]; + cuDoubleComplex det_test; + if(range_all det_for_rpa(det_test.x, det_test.y); + // printf("det_for_rpa_gpu: %f+%fi\n",det_test.x,det_test.y); + + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); + // printf("LU time by lapack: %lld us\n", duration.count()); + // complex trace_pi; + complex ln_det; + ln_det = std::log(det_for_rpa); + + // printf("in_det: %f+%fi, trace_pi: %f+%fi\n", ln_det.real(), ln_det.imag(), + // trace_pi.real(), trace_pi.imag()); + // cout << "PI trace vector:" << endl; + // cout << endl; + rpa_for_omega_q = ln_det + trace_pi; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n", + // tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration); + // printf("rpa_for_omega_q:%f+%fi,freq_weight:%f,irk_weight[q]:%f\n", + // rpa_for_omega_q.real(), rpa_for_omega_q.imag(), freq_weight, + // irk_weight[q]); + #ifdef OPEN_OMP_FOR_LU_DECOMPOSITION + #pragma omp critical + #endif + { + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + // printf("freq: %f, q: (%f, %f, %f), rpa_for_omega_q: %f+%fi,tot_RPA_energy_gpu: %f+%fi,num_iteration:%d,deviceId:%d,streamId:%d\n", + // freq, q.x, q.y, q.z, rpa_for_omega_q.real(), rpa_for_omega_q.imag(),tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration, deviceId, streamId); + + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n",tot_RPA_energy.real(),tot_RPA_energy.imag(),num_iteration); + } + // double test_endTime=omp_get_wtime(); + // printf("task_time:%f(aimed to test whether task is congested)\n",test_endTime-test_startTime); + num_iteration++; + #ifndef OPEN_OMP_FOR_LU_DECOMPOSITION + printf("one task time:%f\n",omp_get_wtime()-one_task_begin); + #endif + } + } + + } + printf("time for calculate: %f\n",omp_get_wtime()-time_calculate); + printf("mpi_comm_global_h.myid:%d,num_iteration:%d\n",mpi_comm_global_h.myid,num_iteration); + #ifdef 
OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("time for all tasks: %f\n",omp_get_wtime()-start_time); + #endif + printf("tot_RPA_energy_gpu: %f+%fi\n",tot_RPA_energy.real(),tot_RPA_energy.imag()); + if(range_all, complex> global_cRPA_q; + for (auto q_weight : irk_weight) + { + MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, + MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); + } + + for (auto &q_crpa : global_cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + complex gather_tot_RPA_energy(0.0, 0.0); + MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, + mpi_comm_global_h.comm); + corr.value = gather_tot_RPA_energy; + } + // printf("gather_tot_RPA_energy_gpu: %f+%fi\n",corr.value.real(),corr.value.imag()); + corr.etype = CorrEnergy::type::RPA; + return corr; +} +#ifdef ENABLE_NVHPC + +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &d_A, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order) +{ + MatrixDevice> d_A_T; + + ORDER_CHECK(order); + if(order=='C'||order=='c'){ + d_A_T.set_data(d_A.nc(), d_A.nr(), device_stream.stream); + DeviceConnector::transpose_device_blas(d_A.ptr(),d_A_T.nr(), d_A_T.nc(),d_A_T.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + } + int ia=1,ja=1; + bool is_mixed_precision=false; + if(is_mixed_precision){ + MatrixDevice> d_A_f(d_A.nr(),d_A.nc(),device_stream.stream); + + DeviceConnector::double_to_float_device((order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),(float*)d_A_f.ptr(),d_A.nr()*d_A.nc()*2); + + DeviceConnector::pgetrf_device_mixed_precision( + d_A_f.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + order + ); + DeviceConnector::float_to_double_device((float*)d_A_f.ptr(),(order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),d_A.nr()*d_A.nc()*2); + }else{ + DeviceConnector::pgetrf_device_mixed_precision( + (order=='C'||order=='c')?(void*)d_A_T.ptr():(void*)d_A.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + order); + } + if(order=='C'||order=='c'){ + DeviceConnector::transpose_device_blas(d_A_T.ptr(),d_A.nr(),d_A.nc(), d_A.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + d_A_T.clean(device_stream.stream); + } + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + std::complex det_loc; + + DeviceConnector::det_matrix_device_blacs( + &det_loc, d_A.ptr(), arrdesc_pi, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + if(det_loc.real() > 0) + { + ln_det_loc = std::log(det_loc); + } + else + { + ln_det_loc = std::log(-det_loc); + } + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + return ln_det_all; +} +// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using NVHPC" << endl; + } + mpi_comm_global_h.barrier(); + + Profiler::start("compute_Wc_freq_q_blacs_init"); + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // Use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter Maximal 
blocksize ensure that atom indices related to the rows/columns of a + // local matrix is minimized. + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + // This, however, is not optimal for matrix operations, and may lead to segment fault during + // MPI operations with parallel linear algebra subroutine. Thus we define an optimal blocksize + Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); + const int nb_opt = min(128, desc_nabf_nabf.nb()); + desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); + // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric + // matrix + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + // temp_block is used to collect data from IJ-pair data structure with comm_map2_first + auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + GpuDeviceStream gpu_dev_stream; + ComplexMatrixDevice d_temp_block; + ComplexMatrixDevice d_coul_block; + #endif + // Below are the working arrays for matrix operations + auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_chi0_block; + ComplexMatrixDevice d_coul_chi0_block; + ComplexMatrixDevice d_coul_eigen_block; + #endif + auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_coulwc_block; + #endif + + const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + + coul_chi0_block.size() + coulwc_block.size()) * + 16.0e-6; + ofs_myid << get_timestamp() + << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks + << endl; + + const auto atpair_local = dispatch_upper_trangular_tasks( + natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, + blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); +#ifdef LIBRPA_DEBUG + ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; + ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; +#endif + + // IJ pair of Wc to be returned + pair, set> Iset_Jset_Wc; + for (const auto &ap : atpair_local) + { + Iset_Jset_Wc.first.insert(ap.first); + Iset_Jset_Wc.second.insert(ap.second); + } + + // Prepare local basis indices for 2D->IJ map + int I, iI; + map> map_lor_v; + map> map_loc_v; + for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_lor_v[I].push_back(iI); + } + for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_loc_v[I].push_back(iI); + } + + vector> qpts; + for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); + + vec eigenvalues(n_abf); + Profiler::cease("compute_Wc_freq_q_blacs_init"); + LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); + + Profiler::start("compute_Wc_freq_q_work"); 
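+    // Rough outline of the per-(q, omega) work in the loop below: the V(q) and
+    // chi0(q, omega) atom-pair tensors are redistributed into 2D BLACS blocks via
+    // LibRI, the symmetrized polarizability v^{1/2} chi0 v^{1/2} is built in the
+    // Coulomb-eigenvector basis, the head/wing elements are overwritten at Gamma when
+    // option_dielect_func == 3, the dielectric matrix is inverted, and
+    //     Wc(q, omega) ~ v_trunc^{1/2} (eps^{-1} - 1) v_trunc^{1/2}
+    // is assembled with the truncated-Coulomb square root prepared once per q-point.
+    // With ENABLE_NVHPC the pgemr2d / pgemm / inversion steps go through CudaConnector
+    // on the GPU stream; otherwise the corresponding ScaLAPACK calls are used.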
+#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); + const int iq_in_k = + std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); + // q-point in fractional coordinates + const auto &qf = kfrac_list[iq_in_k]; + LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, + qpts.size(), qf.x, qf.y, qf.z); + coul_block.zero_out(); + coulwc_block.zero_out(); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + // q-array for LibRI object + std::array qa = {q.x, q.y, q.z}; + + // collect the block elements of truncated coulomb matrices first + // as we reuse coul_eigen_block to reduce memory usage + Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); + { + size_t n_singular_coulwc; + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || + coulmat_wc.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); + + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); + Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); + power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); + ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; + + Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); + // collect the block elements of coulomb matrices + { + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if 
(coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || + coulmat_eps.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; + // ofs_myid << "Couleps_libri" << endl << couleps_libri; + // if (couleps_libri.size() == 0) + // throw std::logic_error("data at q-point not found in coulmat_eps"); + + // perform communication + ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; +#ifdef LIBRPA_DEBUG + ofs_myid << set_IJ_nabf_nabf << endl; + ofs_myid << "Extended blocks" << endl; + ofs_myid << "atom 1: " << s0_s1.first << endl; + ofs_myid << "atom 2: " << s0_s1.second << endl; +#endif + // ofs_myid << "Owned blocks\n"; + // print_keys(ofs_myid, couleps_libri); + // mpi_comm_global_h.barrier(); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; + + ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_coul_block.set_data(coul_block.nr(),coul_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + gpu_dev_stream.cudaSync(); + CUDA_CHECK(cudaMemcpy(coul_block.ptr(), d_coul_block.ptr(), sizeof(cuDoubleComplex)*coul_block.nr()*coul_block.nc(), cudaMemcpyDeviceToHost)); + #endif + ofs_myid << get_timestamp() << " Done construct couleps 2D block" << endl; + } + + size_t n_singular; + ofs_myid << get_timestamp() << " Start power hemat couleps\n"; + matrix_m> sqrtveig_blacs; + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_sqrtveig_blacs; + #endif + if (is_gamma_point(q)) + { + // choice of power_hemat_blacs_real/power_hemat_blacs_desc + // leads to sub-meV difference + sqrtveig_blacs = power_hemat_blacs_real( + coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, + eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); + if (Params::replace_w_head && Params::option_dielect_func == 3) + { + df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); + } + } + else + { + sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + } + ofs_myid << get_timestamp() << " Done power hemat couleps\n"; + // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); + // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at + // imaginary frequency is not prepared + if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); + const size_t n_nonsingular = n_abf - 
n_singular; + if(gpu_dev_stream.rank==0){ + printf("n_abf:%lu,n_nonsingular:%lu,n_singular:%lu\n",n_abf,n_nonsingular,n_singular); + } + Profiler::stop("epsilon_prepare_couleps_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); + ofs_myid << get_timestamp() << " Done couleps sqrt\n"; + std::flush(ofs_myid); + + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + Profiler::start("epsilon_wc_work_q_omega"); + Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); + chi0_block.zero_out(); + { + std::map>, + RI::Tensor>>> + chi0_libri; + if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) + { + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = + RI::Tensor>({n_mu, n_nu}, pchi); + } + } + // Release the chi0 block for this frequency and q to reduce memory load, + // as they will not be used again + chi0.free_chi0_q(freq, q); + } + // ofs_myid << "chi0_libri" << endl << chi0_libri; + Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); + const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); + Profiler::start("epsilon_prepare_chi0_2d_collect_block"); + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_chi0_block.set_data(chi0_block.nr(),chi0_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + #endif + Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); + } + Profiler::stop("epsilon_prepare_chi0_2d"); + + Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); + if(gpu_dev_stream.rank==0) + printf("is_gamma_point(q):%d\n",is_gamma_point(q)); + const std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) + { + ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; + // rotate to Coulomb-eigenvector basis + // descending order + + d_sqrtveig_blacs.set_data(sqrtveig_blacs.nr(), sqrtveig_blacs.nc(), sqrtveig_blacs.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + gpu_dev_stream.cudaSync(); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_abf, + &calpha, + d_chi0_block,1,1,desc_nabf_nabf_opt, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + &cbeta, + 
d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_C,CUBLAS_OP_N,n_nonsingular, n_nonsingular, n_abf, + &calpha, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + if (Params::option_dielect_func == 3) + { + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + // { + // std::string filename = "gpu_"; + // filename += to_string(gpu_dev_stream.nranks); + // filename += "_"; + // filename += to_string(gpu_dev_stream.rank); + // gpu_dev_stream.cudaSync(); + // CUDA_CHECK(cudaMemcpy(coul_chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*coul_chi0_block.nr()*coul_chi0_block.nc(),cudaMemcpyDeviceToHost)); + // CudaConnector::write_file((cuDoubleComplex*)coul_chi0_block.ptr(),coul_chi0_block.nr(),coul_chi0_block.nc(),filename.data()); + // } + // CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + + ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" + << endl; + // df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); + df_headwing.rewrite_eps_nvhpc(gpu_dev_stream, d_chi0_block, ifreq, desc_nabf_nabf_opt); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + // rotate back to ABF + // descending order + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // subtract 1 from diagonal + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,-1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_multiply_coulwc"); + } + else + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); + if (ilo >= 0 && jlo >= 0) + { + ofs_myid << get_timestamp() << "Perform 
the head element overwrite" << endl; + std::complex temp_element = 1.0 - epsmac_LF_imagfreq[ifreq]; + CUDA_CHECK(cudaMemcpy(d_chi0_block.ptr() + ilo + jlo * d_chi0_block.nr(), + &temp_element, sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice)); + } + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + 
Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + } + else + { + Profiler::start("epsilon_compute_eps_pgemm_1"); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::cease("epsilon_compute_eps_pgemm_1"); + Profiler::start("epsilon_compute_eps_pgemm_2"); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // d_coul_block.cublasClean(gpu_dev_stream.cublas_handle); + Profiler::cease("epsilon_compute_eps_pgemm_2"); + // now chi0_block is actually v1/2 chi v1/2 + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::stop("epsilon_compute_eps"); + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + 
gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + // Array_Desc_Device array_desc_device(desc_nabf_nabf_opt); + // printf("successful create object array_desc_device\n"); + #ifndef ENABLE_NVHPC + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + CUDA_C_64F + ); + CUDA_CHECK(cudaMemcpyAsync(temp_block.ptr(), d_temp_block.ptr(), sizeof(cuDoubleComplex)*temp_block.nr()*temp_block.nc(), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + #endif + + Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); + Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); + map>>> Wc_MNmap; + // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, + // LIBRPA::atomic_basis_abf, chi0_block, + // desc_nabf_nabf, MAJOR::ROW); + map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, + temp_block, desc_nabf_nabf, MAJOR::ROW); + Profiler::stop("epsilon_convert_wc_map_block"); + + Profiler::start("epsilon_convert_wc_communicate", "Communicate"); + { + std::map>, + RI::Tensor>>> + Wc_libri; + Profiler::start("epsilon_convert_wc_communicate_1"); + for (const auto &M_NWc : Wc_MNmap) + { + const auto &M = M_NWc.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_Wc : M_NWc.second) + { + const auto &N = N_Wc.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &Wc = N_Wc.second; + // std::valarray> Wc_va(Wc.ptr(), Wc.size()); + // auto pWc = std::make_shared>>(); + // *pWc = Wc_va; + /*if (iq == 10 && ifreq == 10) + { + char fn[100]; + sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); + print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); + }*/ + Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); + } + } + Profiler::stop("epsilon_convert_wc_communicate_1"); + Profiler::start("epsilon_convert_wc_communicate_2"); + // main timing + // cout << Wc_libri; + const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); + Profiler::stop("epsilon_convert_wc_communicate_2"); + Profiler::start("epsilon_convert_wc_communicate_3"); + // parse collected to + for (const auto &MN : atpair_local) + { + const auto &M = MN.first; + const auto &N = MN.second; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + // Use row major for later usage in LibRI + Wc_freq_q[freq][M][N][q] = matrix_m>( + n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); + } + Profiler::stop("epsilon_convert_wc_communicate_3"); + // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) + // for ( 
int i_nu = 0; i_nu != n_nu; i_nu++ ) + // { + // } + } + Profiler::stop("epsilon_convert_wc_communicate"); + Profiler::stop("epsilon_convert_wc_2d_to_ij"); + Profiler::cease("epsilon_wc_work_q_omega"); + LIBRPA::utils::lib_printf_root( + "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, + Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), + Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + Profiler::cease("compute_Wc_freq_q_work"); + LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_work"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); + + return Wc_freq_q; +} + +#endif \ No newline at end of file diff --git a/src/epsilon_cuda.h b/src/epsilon_cuda.h new file mode 100644 index 00000000..0fd7b2d2 --- /dev/null +++ b/src/epsilon_cuda.h @@ -0,0 +1,18 @@ +#pragma once +#include "epsilon.h" +#include "device_connector.h" +#include "matrix_device.h" + +#ifdef LIBRPA_USE_CUDA +CorrEnergy compute_RPA_correlation_cuda(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat); +#endif +#ifdef ENABLE_NVHPC +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat); + +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &, const atpair_k_cplx_mat_t &, + atpair_k_cplx_mat_t &, + const vector> &); +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order='C'); +#endif diff --git a/src/gpu_device_stream.cpp b/src/gpu_device_stream.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/gpu_device_stream.h b/src/gpu_device_stream.h new file mode 100644 index 00000000..849556bd --- /dev/null +++ b/src/gpu_device_stream.h @@ -0,0 +1,121 @@ +#pragma once +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#endif + +class GpuDeviceStream{ +private: + cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? 
CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + + return localRank % deviceCount; + } + + +public: + int rank; + int nranks; + int local_device; + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + + cusolverMpHandle_t cusolver_handle = nullptr; + cublasMpHandle_t cublas_handle = nullptr; + GpuDeviceStream(){ + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + local_device = GpuDeviceStream::getLocalDevice(); + // printf("myrank:%d, local_device:%d\n", rank, local_device); + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = GpuDeviceStream::allgather; + params.req_test = GpuDeviceStream::request_test; + params.req_free = GpuDeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = rank; + params.nranks = nranks; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate(&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolver_handle, local_device, stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublas_handle, stream)); + + } + ~GpuDeviceStream(){ + if(stream!=nullptr){ + CUDA_CHECK(cudaStreamDestroy(stream)); + stream=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + if(cusolver_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolver_handle)); + cusolver_handle=nullptr; + } + if(cublas_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublas_handle)); + cublas_handle=nullptr; + } + } + void cudaSync() const { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,stream)); + } +}; \ No newline at end of file diff --git a/src/helpers.h b/src/helpers.h new file mode 100644 index 00000000..ec014586 --- /dev/null +++ b/src/helpers.h @@ -0,0 +1,107 @@ + +#ifndef HELPERS_H +#define HELPERS_H + +#pragma once +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#endif + + +typedef enum{ + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + LIBRPA_COMPUTE_TYPE_DOUBLE, + LIBRPA_COMPUTE_TYPE_FLOAT +}LIBRPA_DEVICE_COMPUTE_TYPE; + + +#ifdef ENABLE_NVHPC +#define NVHPC_MPI_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != MPI_SUCCESS) \ + { \ + fprintf(stderr, "MPI error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define NVSHMEM_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != 0) \ + { \ + fprintf(stderr, "NVSHMEM error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t status = call; \ + if (status != cudaSuccess) \ + { \ + fprintf(stderr, "CUDA error at %s:%d : %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLASMP_CHECK(call) \ + do \ + { \ + cublasStatus_t status = call; \ + if (status != CUBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuBLASMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } 
while (0) +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define CAL_CHECK(call) \ + do \ + { \ + calError_t status = call; \ + if (status != CAL_OK) \ + { \ + fprintf(stderr, "CAL error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + }while(0) + +#define CUSOLVERMP_CHECK(call) \ + do \ + { \ + cusolverStatus_t status = call; \ + if (status != CUSOLVER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuSOLVERMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif +#define ORDER_CHECK(order) \ + do \ + { \ + if(order !='C'&&order !='c'&&order !='R'&&order !='r') \ + { \ + fprintf(stderr, "Order should be either 'C' or 'R', order error at %s:%d:%s\n", __FILE__, __LINE__, order);\ + exit(EXIT_FAILURE); \ + } \ + }while (0) + +#endif // HELPERS_H \ No newline at end of file diff --git a/src/matrix_device.cpp b/src/matrix_device.cpp new file mode 100644 index 00000000..28b7987e --- /dev/null +++ b/src/matrix_device.cpp @@ -0,0 +1,16 @@ +#include "matrix_device.h" +#include "device_stream.h" + +template +void MatrixDevice::set_data_device(const int& m, const int& n, const void* d_A, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeer(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } +} \ No newline at end of file diff --git a/src/matrix_device.h b/src/matrix_device.h new file mode 100644 index 00000000..000c3847 --- /dev/null +++ b/src/matrix_device.h @@ -0,0 +1,111 @@ +#ifndef MATRIX_DEVICE_H +#define MATRIX_DEVICE_H + +#include +#ifdef ENABLE_NVHPC +#include +#include "helpers.h" +#endif + +template +class MatrixDevice{ +private: + T* d_data=nullptr; + int m=0; + int n=0; +public: + MatrixDevice(){} + MatrixDevice(const int& m, const int& n, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + MatrixDevice(const int& m, const int& n,const void* c_data, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data(const int& m, const int& n, const void* stream){ + if(m!=this->m || n!=this->n){ + clean(stream); + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + } + void set_data(const int& m, const int& n,const 
void* c_data, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data_device(const int& m, const int& n, const void* d_A, const void* stream); + + + void clean(const void* stream){ + if(d_data!=nullptr){ + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + cudaFree(d_data); + #endif + }else{ + #ifdef ENABLE_NVHPC + cudaFreeAsync(d_data, (cudaStream_t)stream); + #endif + } + d_data=nullptr; + } + this->m=0; + this->n=0; + } + T* ptr(){ + return d_data; + } + const T* ptr() const { + return d_data; + } + int nr() const { + return m; + } + int nc() const { + return n; + } + ~MatrixDevice(){ + clean(nullptr); + } + +}; + +#endif // MATRIX_DEVICE_H \ No newline at end of file
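
For orientation, the quantities assembled in the NVHPC branch of compute_Wc_freq_q_blacs_cuda are the standard symmetrized-dielectric expressions (stated here as a summary, not code taken from the patch): with v^{1/2} and v_c^{1/2} the square roots of the bare and truncated Coulomb matrices in the ABF basis produced by power_hemat_blacs,

\[
\varepsilon(\mathbf{q}, i\omega) \;=\; 1 \;-\; v^{1/2}(\mathbf{q})\,\chi_0(\mathbf{q}, i\omega)\,v^{1/2}(\mathbf{q}),
\qquad
W_c(\mathbf{q}, i\omega) \;=\; v_c^{1/2}(\mathbf{q})\,\bigl[\varepsilon^{-1}(\mathbf{q}, i\omega) - 1\bigr]\,v_c^{1/2}(\mathbf{q}).
\]

On the device this is carried out without forming the inverse explicitly: pgetrf_nvhpc_mixed_precision factorizes the dielectric matrix and pgetrs_nvhpc_mixed_precision solves eps * X = v_c^{1/2}, after which the pgeadd/pgemm pair assembles v_c^{1/2} (eps^{-1} - 1) v_c^{1/2}. The transpose calls around the factorization only switch between row- and column-major storage when order == 'c'; they do not change the result.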
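
The rank-to-GPU binding in GpuDeviceStream::getLocalDevice follows a common MPI pattern: split the world communicator by shared-memory node and take the node-local rank modulo the number of visible devices. A minimal standalone sketch of that pattern, using only MPI and the CUDA runtime (no cuSOLVERMp/cuBLASMp); the file name and printed message are illustrative:

    // bind_local_gpu.cpp -- sketch of the node-local rank -> device mapping
    // used by GpuDeviceStream::getLocalDevice(); build with an MPI C++ wrapper
    // and link against the CUDA runtime.
    #include <cstdio>
    #include <mpi.h>
    #include <cuda_runtime.h>

    static int bind_local_gpu()
    {
        // Node-local rank: ranks sharing a node land in the same sub-communicator.
        MPI_Comm local_comm;
        int local_rank = 0;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm);
        MPI_Comm_rank(local_comm, &local_rank);
        MPI_Comm_free(&local_comm);

        int n_dev = 0;
        if (cudaGetDeviceCount(&n_dev) != cudaSuccess || n_dev == 0)
            return -1;  // no usable device; the caller should fall back to the CPU path

        const int dev = local_rank % n_dev;   // round-robin ranks over the node's GPUs
        if (cudaSetDevice(dev) != cudaSuccess)
            return -1;
        cudaFree(nullptr);                    // force early CUDA context creation
        return dev;
    }

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int world_rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
        const int dev = bind_local_gpu();
        std::printf("rank %d bound to device %d\n", world_rank, dev);
        MPI_Finalize();
        return 0;
    }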
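
MatrixDevice<T> is a thin RAII wrapper around a device allocation: set_data(m, n, stream) (re)allocates only when the shape changes, set_data(m, n, host_ptr, stream) additionally uploads host data, and clean(stream) releases the buffer (the destructor also frees it). A hypothetical round-trip usage sketch, assuming a build with -DENABLE_NVHPC so that CUDA_CHECK from helpers.h is defined; the placeholder comment stands in for the real cuBLASMp/cuSOLVERMp work:

    // matrix_device_usage.cpp -- hypothetical host <-> device round trip
    // through MatrixDevice from this patch.
    #include <complex>
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cuComplex.h>
    #include "matrix_device.h"

    int main()
    {
        const int m = 4, n = 4;
        std::vector<std::complex<double>> host(m * n, {1.0, 0.0});

        cudaStream_t stream = nullptr;
        CUDA_CHECK(cudaStreamCreate(&stream));

        // Allocate on the device and upload the host block in one call.
        MatrixDevice<cuDoubleComplex> d_mat;
        d_mat.set_data(m, n, host.data(), stream);

        // ... device work on d_mat.ptr() would go here (pgemm_nvhpc etc. in the real code) ...

        // Copy the block back and wait for the stream before touching host memory.
        CUDA_CHECK(cudaMemcpyAsync(host.data(), d_mat.ptr(),
                                   sizeof(cuDoubleComplex) * d_mat.nr() * d_mat.nc(),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        std::printf("block %d x %d round-tripped, first element (%f, %f)\n",
                    d_mat.nr(), d_mat.nc(), host[0].real(), host[0].imag());

        d_mat.clean(stream);   // explicit release; the destructor would also free it
        CUDA_CHECK(cudaStreamDestroy(stream));
        return 0;
    }

Reallocating only on shape changes is what lets the epsilon loop call set_data once per q-point and frequency without churning device memory, since the 2D block sizes stay fixed across iterations.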