diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9d4ee05..4264025f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,14 +20,18 @@ set(LIB_NAME "rpa") set(FORTRAN_LIB_NAME "rpa_f") # options setup
-option(USE_LIBRI "Use LibRI for tensor contraction" OFF)
+option(USE_LIBRI "Use LibRI for tensor contraction" ON)
 option(USE_CMAKE_INC "Use cmake.inc for configure" OFF)
-option(USE_GREENX_API "Use GreenX API for minimax grids generation" OFF)
+option(USE_GREENX_API "Use GreenX API for minimax grids generation" ON)
 option(USE_EXTERNAL_GREENX "Use external GreenX library rather than the packaged one" OFF) option(ENABLE_TEST "Flag to build unit tests" ON) option(ENABLE_DRIVER "Flag to build driver executables" ON) option(ENABLE_FORTRAN_BIND "Flag to build Fotran binding" OFF) option(VERBOSE_OUTPUT "Flag to print verbose information in stdout and process output" OFF)
+option(USE_CUDA "Use CUDA for EcRPA calculation" OFF)
+option(ENABLE_CUSOLVERMP "Use cusolverMp for EcRPA calculation" OFF)
+option(ENABLE_CUBLASMP "Use cuBLASMp for EcRPA calculation" OFF)
+option(ENABLE_NVHPC "Use NVHPC math libraries (cusolverMp and cuBLASMp) for calculation" OFF)
 # NOTE: static library not tested option(BUILD_LIBRPA_SHARED "Flag to build shared libraries" ON)
@@ -61,6 +65,10 @@ if(USE_GREENX_API OR ENABLE_FORTRAN_BIND) else() enable_language(CXX) endif()
+# added by hbchen in 2025-5-19
+if(USE_CUDA)
+ enable_language(CUDA)
+endif()
 # bypass the deprecation warning of classic C++ of oneAPI if(CMAKE_CXX_COMPILER_ID MATCHES Intel)
@@ -177,6 +185,46 @@ if(USE_LIBRI) add_compile_definitions("LIBRPA_USE_LIBRI") endif()
+
+# add GPU acceleration by chenhaobo in 2025-04-26
+# cmake_policy(SET CMP0146 NEW)
+if(USE_CUDA)
+ find_package(CUDA REQUIRED)
+ find_package(CUDAToolkit REQUIRED)
+ add_definitions(-DADD_)
+ include_directories(${MAGMA_ROOT}/include)
+ set(MAGMA_INCLUDE_DIR "${MAGMA_ROOT}/include")
+ set(MAGMA_INCLUDE_DIR $ENV{MAGMA_INCLUDE_DIR})
+ link_directories(${MAGMA_ROOT}/lib)
+ set(CMAKE_CUDA_STANDARD 14)
+ set(CUDA_LIBRARIES "-lcublas -lcudart -lcusolver")
+ list(APPEND math_libs
+ ${CUDA_LIBRARIES}
+ ${MAGMA_ROOT}/lib/libmagma.so
+ )
+ add_compile_definitions("LIBRPA_USE_CUDA")
+endif()
+if(ENABLE_NVHPC)
+ add_compile_definitions("ENABLE_NVHPC")
+ set(ENABLE_CUSOLVERMP ON)
+ set(ENABLE_CUBLASMP ON)
+endif()
+if(ENABLE_CUSOLVERMP)
+ add_compile_definitions("ENABLE_CUSOLVERMP")
+ set(CUSOLVERMP_LIBRARIES "-lcal -lcusolverMp")
+ list(APPEND math_libs
+ ${CUSOLVERMP_LIBRARIES}
+ )
+endif()
+if(ENABLE_CUBLASMP)
+ add_compile_definitions("ENABLE_CUBLASMP")
+ set(CUBLASMP_LIBRARIES "-lcublasmp -lcurand -lcublas")
+ list(APPEND math_libs
+ ${CUBLASMP_LIBRARIES}
+ )
+endif()
+# finish adding GPU acceleration by chenhaobo in 2025-04-26
+
 if(CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Build type set to Debug, adding LIBRPA_DEBUG preprocessor directive") add_compile_definitions("LIBRPA_DEBUG")
diff --git a/driver/main.cpp b/driver/main.cpp
index 736a1838..164671ed 100644
--- a/driver/main.cpp
+++ b/driver/main.cpp
@@ -39,7 +39,9 @@ #include "utils_cmake.h" #include "utils_mem.h" #include "utils_mpi_io.h"
-
+#ifdef ENABLE_NVHPC
+#include "device_stream.h"
+#endif
 static void initialize(int argc, char **argv) { using namespace LIBRPA::envs;
@@ -54,7 +56,9 @@ static void initialize(int argc, char **argv) } initialize_librpa_environment(MPI_COMM_WORLD, 0, 0, "");
-
+ #ifdef ENABLE_NVHPC
+ device_stream.init();
+ #endif
 // Global profiler begins right after MPI is initialized Profiler::start("total", "Total"); }
@@ -77,7 +81,9 @@ static void
finalize(bool success) lib_printf("libRPA failed\n"); } }
-
+ #ifdef ENABLE_NVHPC
+ device_stream.finalize();
+ #endif
 finalize_librpa_environment(); MPI_Finalize();
diff --git a/driver/task_gw_band.cpp b/driver/task_gw_band.cpp
index d5393743..850ba297 100644
--- a/driver/task_gw_band.cpp
+++ b/driver/task_gw_band.cpp
@@ -23,7 +23,10 @@ #include "ri.h" #include "utils_timefreq.h" #include "write_aims.h"
-
+#ifdef ENABLE_NVHPC
+#include "epsilon_cuda.h"
+#include <cuda_runtime.h>
+#endif
 void task_g0w0_band(std::map, ComplexMatrix> &sinvS) { using LIBRPA::envs::mpi_comm_global_h;
@@ -156,6 +159,13 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Wc_freq_q; if (Params::use_scalapack_gw_wc) {
+ #ifdef ENABLE_NVHPC
+ int numDevices;
+ cudaError_t cudaStat = cudaGetDeviceCount(&numDevices);
+ if(cudaStat == cudaSuccess && numDevices>0)
+ Wc_freq_q = compute_Wc_freq_q_blacs_cuda(chi0, Vq, Vq_cut, epsmac_LF_imagfreq);
+ else
+ #endif
 Wc_freq_q = compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, epsmac_LF_imagfreq); } else
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e36be38e..213ef544 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -72,6 +72,29 @@ else() list(APPEND lib_sources get_minimax_local.cpp) endif()
+if(USE_CUDA)
+ list(APPEND lib_sources
+ epsilon_cuda.cpp
+ cuda_connector.cu
+ )
+endif()
+if(ENABLE_NVHPC)
+ list(APPEND lib_sources
+ cuda_connector.cpp
+ array_desc_device.cu
+ device_stream.cpp
+ matrix_device.cpp
+ device_connector.cpp
+ )
+ # set_source_files_properties(
+ # base_blacs.cpp
+ # PROPERTIES
+ # LANGUAGE CUDA
+ # # COMPILE_OPTIONS "--x cu"
+ # )
+endif()
+
+
 target_include_directories(rpa_lib PRIVATE # ${CMAKE_CURRENT_LIST_DIR}
diff --git a/src/app_rpa.cpp b/src/app_rpa.cpp
index a6cb3128..a837684c 100644
--- a/src/app_rpa.cpp
+++ b/src/app_rpa.cpp
@@ -11,6 +11,10 @@ #include "stl_io_helper.h" #include "utils_mem.h" #include "utils_timefreq.h"
+#ifdef LIBRPA_USE_CUDA
+#include <cuda_runtime.h> // added by hbchen in 2025-5-17
+#include // added by hbchen in 2025-7-26
+#endif
 namespace LIBRPA {
@@ -87,6 +91,23 @@ void get_rpa_correlation_energy_(std::complex<double> &rpa_corr, mpi_comm_global_h.barrier(); Profiler::start("EcRPA", "Compute RPA correlation Energy"); CorrEnergy corr;
+ #ifdef LIBRPA_USE_CUDA
+ int deviceCount;
+ cudaError_t err = cudaGetDeviceCount(&deviceCount);
+ // lib_printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0);
+ printf("Number of CUDA devices: %d\n", deviceCount);
+ // Caveat: if gres=gpu:* is not set for the job, CUDA detects every GPU on the node, so deviceCount may not match the number of allocated devices
+ if(err==cudaSuccess&&deviceCount>0){ // a GPU device is available
+ #ifdef ENABLE_NVHPC
+ if (Params::use_scalapack_ecrpa &&
+ (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR ||
+ LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
+ corr = compute_RPA_correlation_blacs_2d_cuda(chi0, Vq);
+ else
+ #endif
+ corr = compute_RPA_correlation_cuda(chi0, Vq);
+ }else
+ #endif
 if (Params::use_scalapack_ecrpa && (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI))
diff --git a/src/array_desc_device.cu b/src/array_desc_device.cu
new file mode 100644
index 00000000..5dab0fa5
--- /dev/null
+++ b/src/array_desc_device.cu
@@ -0,0 +1,36 @@
+#include "array_desc_device.h"
+
+Array_Desc_Device::Array_Desc_Device(const LIBRPA::Array_Desc& array_desc) {
+ this->ictxt_ = array_desc.ictxt();
+ this->m_ = array_desc.m();
+ this->n_ = array_desc.n();
+ this->mb_ = array_desc.mb();
+ this->nb_ = array_desc.nb();
+ this->lld_ = 
array_desc.lld(); + this->irsrc_ = array_desc.irsrc(); + this->icsrc_ = array_desc.icsrc(); + this->m_local_ = array_desc.m_loc(); + this->n_local_ = array_desc.n_loc(); + this->myprow_ = array_desc.myprow(); + this->mypcol_ = array_desc.mypcol(); + this->nprows_ = array_desc.nprows(); + this->npcols_ = array_desc.npcols(); +} +__host__ __device__ int Array_Desc_Device::indx_g2p(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return (isrcproc + indxglob / nb) % nprocs; +} +__host__ __device__ int Array_Desc_Device::indx_g2l(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__host__ __device__ int Array_Desc_Device::indx_g2l_r(int gindx)const{ + return this->myprow_ != indx_g2p(gindx, this->mb_, this->irsrc_, this->nprows_) || + gindx >= this->m_ + ? -1 + : indx_g2l(gindx, this->mb_, this->irsrc_, this->nprows_); +} +__host__ __device__ int Array_Desc_Device::indx_g2l_c(int gindx)const{ + return this->mypcol_ != indx_g2p(gindx, this->nb_, this->icsrc_, this->npcols_) || + gindx >= this->n_ + ? -1 + : indx_g2l(gindx, this->nb_, this->icsrc_, this->npcols_); +} \ No newline at end of file diff --git a/src/array_desc_device.h b/src/array_desc_device.h new file mode 100644 index 00000000..15e25e80 --- /dev/null +++ b/src/array_desc_device.h @@ -0,0 +1,61 @@ +#include "base_blacs.h" +class Array_Desc_Device{ +private: + int ictxt_; + // int nprocs_; + // int myid_; + int nprows_; + int myprow_; + int npcols_; + int mypcol_; + + // Array dimensions + int m_; + int n_; + int mb_; + int nb_; + int irsrc_; + int icsrc_; + int lld_; + int m_local_; + int n_local_; + __host__ __device__ static int indx_g2p( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); + __host__ __device__ static int indx_g2l( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); +public: + Array_Desc_Device(const LIBRPA::Array_Desc& array_desc); + __host__ __device__ + int indx_g2l_r(int gindx) const; + __host__ __device__ + int indx_g2l_c(int gindx) const; + __host__ __device__ + const int& m() const{ return m_; } + __host__ __device__ + const int& n() const{ return n_; } + __host__ __device__ + const int& mb() const{ return mb_; } + __host__ __device__ + const int& nb() const{ return nb_; } + __host__ __device__ + const int& irsrc() const{ return irsrc_; } + __host__ __device__ + const int& icsrc() const{ return icsrc_; } + __host__ __device__ + const int& lld() const{ return lld_; } + __host__ __device__ + const int& m_loc() const{ return m_local_; } + __host__ __device__ + const int& n_loc() const{ return n_local_; } + __host__ __device__ + const int& nprows() const{ return nprows_; } + __host__ __device__ + const int& npcols() const{ return npcols_; } + __host__ __device__ + const int& myprow() const{ return myprow_; } + __host__ __device__ + const int& mypcol() const{ return mypcol_; } + __host__ __device__ + const int& ictxt() const{ return ictxt_; } + +}; \ No newline at end of file diff --git a/src/cuda_connector.cpp b/src/cuda_connector.cpp new file mode 100644 index 00000000..f273c680 --- /dev/null +++ b/src/cuda_connector.cpp @@ -0,0 +1,2108 @@ +#include "cuda_connector.h" +#include +// cusolverMp include +#include "helpers.h" +#include +#include "lapack_connector.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +#include "device_stream.h" +// #define CUSOLVERMP_MPI_GRID_COL_MAJOR +// #define OPEN_TEST_FOR_LU_DECOMPOSITION +// 
#ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// #include +// #include +// #endif +// #ifdef ENABLE_CUBLASMP +// // #include +// // #include +// #endif +// #include +// #ifdef ENABLE_CUSOLVERMP +// void CudaConnector::pzgetrf_cusolverMp(const int &m, const int &n, std::complex *h_C_A, const int &ia, +// const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int *ipiv, int &h_info_getrf,const char order) +// { +// const int64_t M = arrdesc_pi.m(); +// const int64_t N = arrdesc_pi.n(); + +// const int64_t IA = ia; +// const int64_t JA = ja; + +// /* Tile sizes */ +// const int64_t MA = arrdesc_pi.mb(); +// const int64_t NA = arrdesc_pi.nb(); +// int numRowDevices, numColDevices; +// if(order=='C'){ +// numRowDevices = arrdesc_pi.npcols(); +// numColDevices = arrdesc_pi.nprows(); +// } +// else if(order=='R'){ +// numRowDevices = arrdesc_pi.nprows(); +// numColDevices = arrdesc_pi.npcols(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + + +// const uint32_t RSRCA = 0; +// const uint32_t CSRCA = 0; + +// int mpiCommSize, mpiRank; +// MPI_Comm_size(MPI_COMM_WORLD, &mpiCommSize); +// MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank); + +// int local_device = getLocalDevice(); +// int numDevices = 0; + +// // printf("Number of devices = %d\n", numDevices); +// // printf("local_device = %d, mpiRank = %d, mpiCommSize = %d\n", local_device, mpiRank, mpiCommSize); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_set_device=omp_get_wtime(); +// #endif +// cudaError_t cudaStat = cudaSetDevice(local_device); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for set device:%f seconds, rank:%d\n", omp_get_wtime() - time_start_set_device,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); +// cudaStat = cudaFree(0); +// assert(cudaStat == cudaSuccess); +// // const int mpiRank_T=arrdesc_pi.myprow()+ arrdesc_pi.mypcol()*arrdesc_pi.npcols(); +// const int rank = mpiRank; +// // printf("mpiRank=%d,mpiRank_T=%d\n", mpiRank, mpiRank_T); +// const int commSize = mpiCommSize; +// /* Library handles */ +// cusolverMpHandle_t cusolverMpHandle = NULL; +// cal_comm_t cal_comm = NULL; + +// /* Error codes */ +// cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; +// calError_t calStat = CAL_OK; +// cudaStat = cudaSuccess; + +// /* User defined stream */ +// cudaStream_t localStream = NULL; +// cal_comm_create_params_t params; +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = commSize; +// params.local_device = local_device; + +// calStat = cal_comm_create(params, &cal_comm); +// assert(calStat == CAL_OK); + +// /* Create local stream */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_create_stream=omp_get_wtime(); +// #endif +// cudaStat = cudaStreamCreate(&localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for create stream:%f seconds, rank:%d\n", omp_get_wtime() - time_start_create_stream,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); + +// /* Initialize cusolverMp library handle */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_cusolverMpCreate=omp_get_wtime(); +// #endif +// cusolverStat = cusolverMpCreate(&cusolverMpHandle, local_device, localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for cusolverMpCreate:%f seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank); +// #endif +// assert(cusolverStat == 
CUSOLVER_STATUS_SUCCESS); + +// /* cusolverMp grids */ +// cusolverMpGrid_t gridA_C = NULL; + +// /* cusolverMp matrix descriptors */ +// cusolverMpMatrixDescriptor_t descrA_C = NULL; + +// /* Distributed matrices */ +// void* d_C_A = NULL; // for cuDoubleComplex +// int64_t* d_ipiv = NULL; + +// /* Distributed device workspace */ +// void* d_work_getrf_C = NULL; + +// /* Distributed host workspace */ +// void* h_work_getrf_C = NULL; + +// /* size of workspace on device */ +// size_t workspaceInBytesOnDevice_getrf_C = 0; + +// /* size of workspace on host */ +// size_t workspaceInBytesOnHost_getrf_C = 0; + +// /* error codes from cusolverMp (device) */ +// int* d_info_getrf = NULL; + +// /* error codes from cusolverMp (host) */ +// // int h_info_getrf = 0; + +// /* Single process per device */ +// assert((numRowDevices * numColDevices) == commSize); + +// /* =========================================== */ +// /* Create inputs on master rank */ +// /* =========================================== */ + +// const int64_t lda = (IA - 1) + N; +// const int64_t colsA = (JA - 1) + N; +// // cuDoubleComplex* h_C_A = NULL; +// int64_t LLDA, localColsA; +// if(order=='C'){ +// localColsA =arrdesc_pi.m_loc(); +// LLDA=arrdesc_pi.n_loc(); +// }else if(order=='R'){ +// localColsA =arrdesc_pi.n_loc(); +// LLDA=arrdesc_pi.m_loc(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + +// /* Allocate global d_A */ +// cudaStat = cudaMalloc((void**)&d_C_A, localColsA * LLDA * sizeof(cuDoubleComplex)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* CREATE GRID DESCRIPTORS */ +// /* =========================================== */ +// if(order=='C'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); +// }else if(order=='R'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* =========================================== */ +// /* CREATE MATRIX DESCRIPTORS */ +// /* =========================================== */ +// cusolverStat = cusolverMpCreateMatrixDesc( +// &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* Allocate global d_ipiv */ +// /* REMARK : ipiv overlaps A[IA, JA:JA+N] as in Netlib's ScaLAPACK */ +// cudaStat = cudaMalloc((void**)&d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* ALLOCATE D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMalloc((void**)&d_info_getrf, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* RESET D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMemset(d_info_getrf, 1, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ +// /* =========================================== */ +// cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// 
CUDA_C_64F, +// &workspaceInBytesOnDevice_getrf_C, +// &workspaceInBytesOnHost_getrf_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* =========================================== */ +// /* ALLOCATE PGETRF WORKSPACE */ +// /* =========================================== */ +// cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C); +// assert(cudaStat == cudaSuccess); +// h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); +// assert(h_work_getrf_C != NULL); + +// // copy matrix from h_C_A to d_C_A +// std::complex* h_C_A_temp = NULL; +// size_t temp_size = (int64_t)localColsA * (int64_t)LLDA * sizeof(cuDoubleComplex); +// if(order=='C'){ +// h_C_A_temp = LapackConnector::transpose(h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// cudaStat = cudaMemcpy(d_C_A, h_C_A_temp, temp_size, cudaMemcpyHostToDevice); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(d_C_A, h_C_A, temp_size, cudaMemcpyHostToDevice); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to device */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* =========================================== */ +// /* CALL PGETRF */ +// /* =========================================== */ +// h_info_getrf=1; +// // printf("h_info_getrf before LU composition(cuDoubleComplex) : %d\n", h_info_getrf); +// // printf("LU decomposition begin(cuDoubleComplex)\n"); +// double start_time_C = omp_get_wtime(); +// cusolverStat = cusolverMpGetrf(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// CUDA_C_64F, +// d_work_getrf_C, +// workspaceInBytesOnDevice_getrf_C, +// h_work_getrf_C, +// workspaceInBytesOnHost_getrf_C, +// d_info_getrf); + +// /* sync after cusolverMpGetrf */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); +// // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank); + +// /* copy d_info_getrf to host */ +// cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); +// assert(cudaStat == cudaSuccess); +// /* wait for d_info_getrf copy */ +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); +// // printf("h_info_getrf after composition(cuDoubleComplex) : %d\n", h_info_getrf); +// /* check return value of cusolverMpGetrf */ +// assert(h_info_getrf == 0); +// // copy d_ipiv to ipiv +// cudaStat = cudaMemcpy(ipiv, d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t), cudaMemcpyDeviceToHost); +// assert(cudaStat == cudaSuccess); +// // copy matrix from d_C_A to h_C_A +// if(order=='C'){ +// cudaStat = cudaMemcpy(h_C_A_temp, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// LapackConnector::transpose(h_C_A_temp, h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(h_C_A, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to host */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* 
=========================================== */ +// /* CHECK RESIDUAL ON MASTER */ +// /* =========================================== */ + +// /* =========================================== */ +// /* CLEAN UP HOST WORKSPACE ON MASTER */ +// /* =========================================== */ +// cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// cusolverStat = cusolverMpDestroyGrid(gridA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); +// if (d_C_A != NULL) +// { +// cudaStat = cudaFree(d_C_A); +// assert(cudaStat == cudaSuccess); +// d_C_A = NULL; +// } +// if (d_ipiv != NULL) +// { +// cudaStat = cudaFree(d_ipiv); +// assert(cudaStat == cudaSuccess); +// d_ipiv = NULL; +// } + +// if(h_C_A_temp != NULL) +// { +// delete [] h_C_A_temp; +// h_C_A_temp = NULL; +// } + + +// if (d_work_getrf_C != NULL) +// { +// cudaStat = cudaFree(d_work_getrf_C); +// assert(cudaStat == cudaSuccess); +// d_work_getrf_C = NULL; +// } + +// if (d_info_getrf != NULL) +// { +// cudaStat = cudaFree(d_info_getrf); +// assert(cudaStat == cudaSuccess); +// d_info_getrf = NULL; +// } +// if (h_work_getrf_C) +// { +// free(h_work_getrf_C); +// h_work_getrf_C = NULL; +// } +// /* Destroy cusolverMp handle */ +// cusolverStat = cusolverMpDestroy(cusolverMpHandle); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* sync before cal_comm_destroy */ +// calStat = cal_comm_barrier(cal_comm, localStream); +// assert(calStat == CAL_OK); + +// /* destroy CAL communicator */ +// calStat = cal_comm_destroy(cal_comm); +// assert(calStat == CAL_OK); + +// /* destroy user stream */ +// cudaStat = cudaStreamDestroy(localStream); +// assert(cudaStat == cudaSuccess); + +// /* MPI barrier before MPI_Finalize */ +// MPI_Barrier(MPI_COMM_WORLD); +// // printf("success in test\n"); +// return; +// } +// #endif + +#ifdef ENABLE_NVHPC +void CudaConnector::pzgetrf_nvhpc(const GpuDeviceStream&gpu_dev_stream, ComplexMatrixDevice &d_A, const int &ia, + const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, const char& order) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int mpiRank = gpu_dev_stream.rank; + int mpiCommSize = gpu_dev_stream.nranks; + + int local_device = gpu_dev_stream.local_device; + + const int rank = mpiRank; + const int commSize = mpiCommSize; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + cal_comm_t cal_comm = gpu_dev_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = gpu_dev_stream.stream; + /* Initialize cusolverMp library handle */ + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + double time_start_cusolverMpCreate=omp_get_wtime(); + #endif + // cusolverStat = cusolverMpCreate(&cusolverMpHandle, local_device, localStream); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("time for cusolverMpCreate:%f 
seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank); + #endif + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A.ptr(); // for cuDoubleComplex + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + assert((numRowDevices * numColDevices) == commSize); + + /* =========================================== */ + /* Create inputs on master rank */ + /* =========================================== */ + + const int64_t lda = (IA - 1) + N; + const int64_t colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + CUDA_C_64F, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C); + assert(cudaStat == cudaSuccess); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + + /* sync wait for data to arrive to device */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + cusolverStat = cusolverMpGetrf(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + 
descrA_C, + d_ipiv, + CUDA_C_64F, + d_work_getrf_C, + workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, + workspaceInBytesOnHost_getrf_C, + d_info_getrf); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + gpu_dev_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + + /* sync wait for data to arrive to host */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + cudaStat = cudaFree(d_work_getrf_C); + assert(cudaStat == cudaSuccess); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + /* Destroy cusolverMp handle */ + // cusolverStat = cusolverMpDestroy(cusolverMpHandle); + // assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + // printf("success in test\n"); + return; +} +void CudaConnector::pgetrf_nvhpc_mixed_precision( + const GpuDeviceStream&gpu_dev_stream, void *d_A, + const int &ia, const int &ja, + const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, + const cudaDataType_t &computeType,const char &order + ) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'||order=='c'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'||order=='r'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int local_device = gpu_dev_stream.local_device; + + const int rank = gpu_dev_stream.rank; + const int commSize = gpu_dev_stream.nranks; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + cal_comm_t cal_comm = gpu_dev_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = gpu_dev_stream.stream; + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A; // + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + 
assert((numRowDevices * numColDevices) == commSize); + + const int64_t lda = (IA - 1) + N; + const int64_t colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'||order=='c'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'||order=='r'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'||order=='c'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'||order=='r'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + computeType, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream)); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + CUSOLVERMP_CHECK(cusolverMpGetrf( + cusolverMpHandle, N, N, + d_C_A, IA, JA, descrA_C, + d_ipiv, computeType, + d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, workspaceInBytesOnHost_getrf_C, + d_info_getrf)); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + gpu_dev_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = 
cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream)); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + return; +} +void CudaConnector::pgetrf_nvhpc_mixed_precision( + void *d_A, const int &ia, const int &ja, + const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, + const cudaDataType_t &computeType,const char &order + ) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'||order=='c'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'||order=='r'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int local_device = device_stream.local_device; + + const int rank = mpi_comm_global_h.myid; + const int commSize = mpi_comm_global_h.nprocs; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = device_stream.cusolverMp_handle; + cal_comm_t cal_comm = device_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = (cudaStream_t)device_stream.stream; + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A; // + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + assert((numRowDevices * numColDevices) == commSize); + + const int64_t lda = (IA - 1) + N; + const int64_t colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'||order=='c'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'||order=='r'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'||order=='c'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'||order=='r'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = 
cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + computeType, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream)); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + CUSOLVERMP_CHECK(cusolverMpGetrf( + cusolverMpHandle, N, N, + d_C_A, IA, JA, descrA_C, + d_ipiv, computeType, + d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, workspaceInBytesOnHost_getrf_C, + d_info_getrf)); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + device_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream)); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + return; +} +void CudaConnector::pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + const void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &arrdesc_A, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + int* d_info, const cudaDataType_t& compute_type, + const char& order +) +{ + const int64_t N = arrdesc_A.m(); + const int64_t NRHS = arrdesc_B.n(); + assert(arrdesc_A.m()==arrdesc_A.n()); + + int h_info = 1; + size_t workspaceInBytesOnDevice, workspaceInBytesOnHost; + void* h_work = NULL; + void* d_work = NULL; + const int64_t mbA = arrdesc_A.mb(); + const int64_t nbA = arrdesc_A.nb(); + const int64_t mbB = arrdesc_B.mb(); + const int64_t nbB = arrdesc_B.nb(); + assert(arrdesc_A.npcols() == arrdesc_B.npcols()); + assert(arrdesc_A.nprows() == arrdesc_B.nprows()); + int numRowDevices, 
numColDevices; + ORDER_CHECK(order); + if(order=='C'||order=='c'){ + assert(N==NRHS); + numRowDevices = arrdesc_A.npcols(); + numColDevices = arrdesc_A.nprows(); + } + else{ + numRowDevices = arrdesc_A.nprows(); + numColDevices = arrdesc_A.npcols(); + } + int64_t LLDA, localColsA, LLDB, localColsB; + if(order=='C'||order=='c'){ + LLDA=arrdesc_A.n_loc(); + localColsA =arrdesc_A.m_loc(); + LLDB=arrdesc_B.n_loc(); + localColsB =arrdesc_B.m_loc(); + }else{ + LLDA=arrdesc_A.m_loc(); + localColsA =arrdesc_A.n_loc(); + LLDB=arrdesc_B.m_loc(); + localColsB =arrdesc_B.n_loc(); + } + int mpiCommSize = gpu_dev_stream.nranks; + int rank = gpu_dev_stream.rank; + + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + + cusolverMpGrid_t grid = NULL; + cusolverMpMatrixDescriptor_t descrA = NULL; + cusolverMpMatrixDescriptor_t descrB = NULL; + + CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid( + cusolverMpHandle, &grid, gpu_dev_stream.cal_comm, + numRowDevices, numColDevices, + (order=='c'||order=='C')?CUSOLVERMP_GRID_MAPPING_COL_MAJOR:CUSOLVERMP_GRID_MAPPING_ROW_MAJOR) + ); + + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrA, grid, compute_type, + (IA - 1) + N, (JA - 1) + N, + mbA, nbA, 0, 0, LLDA) + ); + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrB, grid, compute_type, + (IB - 1) + N, (JB - 1) + NRHS, + mbB, nbB, 0, 0, LLDB) + ); + + CUSOLVERMP_CHECK(cusolverMpGetrs_bufferSize( + cusolverMpHandle, trans, N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + &workspaceInBytesOnDevice,&workspaceInBytesOnHost) + ); + gpu_dev_stream.calSync(); + if (workspaceInBytesOnHost > 0) + { + h_work = (void*)malloc(workspaceInBytesOnHost); + assert(h_work != NULL); + } + if (workspaceInBytesOnDevice > 0) + { + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream)); + } + gpu_dev_stream.calSync(); + CUSOLVERMP_CHECK(cusolverMpGetrs( + cusolverMpHandle, trans, + N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + d_work, workspaceInBytesOnDevice, + h_work, workspaceInBytesOnHost, + d_info) + ); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if(h_info!=0) + { + fprintf(stderr, "Error: cusolverMpgetrs failed with info=%d\n", h_info); + exit(1); + } + if(h_work!=NULL) + { + free(h_work); + } + if(d_work!=NULL){ + CUDA_CHECK(cudaFreeAsync(d_work, gpu_dev_stream.stream)); + } +} +void CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &arrdesc_A, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + const cudaDataType_t& compute_type, const char& order +) +{ + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.n_loc(),gpu_dev_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.m_loc(),gpu_dev_stream.stream)); + } + pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_A, 1, 1, arrdesc_A, + d_ipiv, d_info, + CUDA_C_64F, order + ); + pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, trans, + d_A, 1, 1, arrdesc_A, + d_ipiv, + d_B, 1, 1, arrdesc_B, + d_info, + CUDA_C_64F, order + ); + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, 
gpu_dev_stream.stream)); +} +// void CudaConnector::pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, +// const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, +// const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, +// const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C) +// { +// using input_t = cuDoubleComplex; +// using output_t = cuDoubleComplex; +// using compute_t = cuDoubleComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_64F; +// const cudaDataType_t cuda_output_type = CUDA_C_64F; +// cublasComputeType_t cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + +// int64_t mbA = arrdesc_A.mb(); +// int64_t nbA = arrdesc_A.nb(); +// int64_t mbB = arrdesc_B.mb(); +// int64_t nbB = arrdesc_B.nb(); +// int64_t mbC = arrdesc_C.mb(); +// int64_t nbC = arrdesc_C.nb(); +// int nprow = arrdesc_A.nprows(); +// int npcol = arrdesc_A.npcols(); +// char grid_layout = 'r'; +// bool verbose = false; +// int rank, nranks; +// MPI_Comm_size(MPI_COMM_WORLD, &nranks); +// MPI_Comm_rank(MPI_COMM_WORLD, &rank); +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = arrdesc_A.myprow(); +// const int mypcol = arrdesc_A.mypcol(); +// if(rank==0) +// { +// printf("m:%" PRId64 ", n:%" PRId64 ", k:%" PRId64 "\n", m, n, k); +// printf("nprow:%d, npcol:%d\n", nprow, npcol); +// printf("mbA:%" PRId64 ", nbA:%" PRId64 "\n", mbA, nbA); +// } +// const int local_device = getLocalDevice(); +// printf("myrank:%d, myprow:%d, mypcol:%d, local_device:%d\n", rank, myprow, mypcol, local_device); +// CUDA_CHECK(cudaSetDevice(local_device)); +// CUDA_CHECK(cudaFree(nullptr)); + +// cal_comm_t cal_comm; +// cal_comm_create_params_t params; +// { +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = nranks; +// params.local_device = local_device; + +// CAL_CHECK(cal_comm_create(params, &cal_comm)); +// } +// cudaStream_t stream = nullptr; +// CUDA_CHECK(cudaStreamCreate(&stream)); + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + +// cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; + +// input_t* d_A = nullptr; +// input_t* d_B = nullptr; +// output_t* d_C = nullptr; + +// void* d_work = nullptr; + +// compute_t alpha = {alphaD,0.0}; +// compute_t beta = {betaD,0.0}; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// const int64_t llda = cublasMpNumroc(global_m_a, mbA, myprow, 0, nprow); +// const int64_t loc_n_a = cublasMpNumroc(global_n_a, nbA, mypcol, 0, npcol); +// printf("rank:%d, llda=%" PRId64 ", loc_n_a=%" PRId64 "\n", rank, llda, loc_n_a); +// const int64_t lldb = cublasMpNumroc(global_m_b, mbB, myprow, 0, nprow); +// const int64_t loc_n_b = cublasMpNumroc(global_n_b, nbB, mypcol, 0, 
npcol); + +// const int64_t lldc = cublasMpNumroc(global_m_c, mbC, myprow, 0, nprow); +// const int64_t loc_n_c = cublasMpNumroc(global_n_c, nbC, mypcol, 0, npcol); + +// CUDA_CHECK(cudaMallocAsync(&d_A, llda * loc_n_a * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_B, lldb * loc_n_b * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_C, lldc * loc_n_c * sizeof(output_t), stream)); + +// CUDA_CHECK(cudaMemcpyAsync(d_A, A, llda * loc_n_a * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_B, B, lldb * loc_n_b * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_C, C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyHostToDevice, stream)); + +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// cal_comm, +// &grid)); + +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// cublasOperation_t transA,transB; +// if(transa=='N') +// transA=CUBLAS_OP_N; +// else if(transa=='T') +// transA=CUBLAS_OP_T; +// else if(transa=='C') +// transA=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transa=%c is not supported\n", transa); +// exit(1); +// } +// if(transb=='N') +// transB=CUBLAS_OP_N; +// else if(transb=='T') +// transB=CUBLAS_OP_T; +// else if(transb=='C') +// transB=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transb=%c is not supported\n", transb); +// exit(1); +// } + + + +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// &alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// &beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// &workspaceInBytesOnDevice, +// &workspaceInBytesOnHost)); + +// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); +// printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); +// std::vector h_work(workspaceInBytesOnHost); + +// CUDA_CHECK(cudaStreamSynchronize(stream)); + +// const double begin = MPI_Wtime(); + +// CUBLASMP_CHECK(cublasMpGemm( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// &alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// &beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// d_work, +// workspaceInBytesOnDevice, +// h_work.data(), +// workspaceInBytesOnHost)); +// printf("Duration(before synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank); +// CUDA_CHECK(cudaStreamSynchronize(stream)); +// printf("Duration(after synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank); +// if(verbose){ +// } +// CUDA_CHECK(cudaMemcpyAsync(C, d_C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyDeviceToHost, stream)); +// CUDA_CHECK(cudaStreamSynchronize(stream)); + +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); +// 
CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + +// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + +// CUBLASMP_CHECK(cublasMpDestroy(handle)); + +// CUDA_CHECK(cudaFreeAsync(d_A, stream)); +// CUDA_CHECK(cudaFreeAsync(d_B, stream)); +// CUDA_CHECK(cudaFreeAsync(d_C, stream)); +// CUDA_CHECK(cudaFreeAsync(d_work, stream)); + +// CAL_CHECK(cal_comm_destroy(cal_comm)); +// CUDA_CHECK(cudaStreamDestroy(stream)); + +// MPI_Barrier(MPI_COMM_WORLD); + +// } +void CudaConnector::pgemm_device(cublasMpHandle_t handle,cublasOperation_t transA,cublasOperation_t transB,const int &m,const int &n,const int &k, + const void *alpha, + const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja, + const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb, + const void *beta, + ComplexMatrixDevice &d_C,int64_t ic,int64_t jc, + cublasComputeType_t cublas_compute_type) +{ + void* d_work; + + size_t workspaceInBytesOnDevice,workspaceInBytesOnHost; + CUBLASMP_CHECK(cublasMpGemm_bufferSize(handle,transA,transB,m,n,k, + alpha, + (const void *)(d_A.ptr()),ia,ja,d_A.desc_cublas, + (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas, + beta, + d_C.ptr(),ic,jc,d_C.desc_cublas, + cublas_compute_type, + &workspaceInBytesOnDevice, + &workspaceInBytesOnHost)); + CUDA_CHECK(cudaMalloc((void**)&d_work,workspaceInBytesOnDevice)); + std::vector h_work(workspaceInBytesOnHost); + CUBLASMP_CHECK(cublasMpGemm(handle,transA,transB,m,n,k, + alpha, + (const void *)d_A.ptr(),ia,ja,d_A.desc_cublas, + (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas, + beta, + d_C.ptr(),ic,jc,d_C.desc_cublas, + cublas_compute_type, + d_work,workspaceInBytesOnDevice, + h_work.data(),workspaceInBytesOnHost)); + + CUDA_CHECK(cudaFree(d_work)); +} + +void CudaConnector::pgemm_nvhpc(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + ComplexMatrixDevice & d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type) +{ + using input_t = cuDoubleComplex; + using output_t = cuDoubleComplex; + using compute_t = cuDoubleComplex; + const cudaDataType_t cuda_input_type = CUDA_C_64F; + const cudaDataType_t cuda_output_type = CUDA_C_64F; + + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=gpu_dev_stream.rank; + int nranks=gpu_dev_stream.nranks; + + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? 
rank / nprow : rank % npcol); + + cublasMpHandle_t handle = nullptr; + CUBLASMP_CHECK(cublasMpCreate(&handle, gpu_dev_stream.stream)); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, &grid) + ); + + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A.ptr(), ia, ja, descA, + d_B.ptr(), ib, jb, descB, + beta, + d_C.ptr(), ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + gpu_dev_stream.cudaSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream)); + std::vector h_work(workspaceInBytesOnHost); + + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A.ptr(), ia, ja, descA, + d_B.ptr(), ib, jb, descB, + beta, + d_C.ptr(), ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost + )); + + gpu_dev_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); + CUDA_CHECK(cudaFree(d_work)); + + MPI_Barrier(MPI_COMM_WORLD); +} +// void CudaConnector::pgemm_nvhpc_cuFloatComplex(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, +// const void *alpha, +// const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, +// const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, +// const void *beta, +// cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, +// cublasComputeType_t cublas_compute_type) +// { +// using input_t = cuFloatComplex; +// using output_t = cuFloatComplex; +// using compute_t = cuFloatComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_32F; +// const cudaDataType_t cuda_output_type = CUDA_C_32F; + +// int64_t mbA = array_descA.mb(); +// int64_t nbA = array_descA.nb(); +// int64_t mbB = array_descB.mb(); +// int64_t nbB = array_descB.nb(); +// int64_t mbC = array_descC.mb(); +// int64_t nbC = array_descC.nb(); +// int nprow = array_descA.nprows(); +// int npcol = array_descA.npcols(); +// int llda= array_descA.lld(); +// int lldb= array_descB.lld(); +// int lldc= array_descC.lld(); +// 
cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; +// char grid_layout = 'r'; +// int rank=gpu_dev_stream.rank; +// int nranks=gpu_dev_stream.nranks; +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); +// const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); +// const int local_device = gpu_dev_stream.local_device; +// cudaStream_t stream = gpu_dev_stream.stream; + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); +// void* d_work = nullptr; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// gpu_dev_stream.cudaSync(); +// // printf("before create grid, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// gpu_dev_stream.cal_comm, +// &grid)); +// // printf("after create grid, rank:%d\n", rank); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// // printf("after create desc, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// &workspaceInBytesOnDevice, +// &workspaceInBytesOnHost)); +// // printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); +// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); +// std::vector h_work(workspaceInBytesOnHost); + +// gpu_dev_stream.cudaSync(); + +// // const double begin = MPI_Wtime(); +// // printf("before gemm, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// d_work, +// workspaceInBytesOnDevice, +// h_work.data(), +// workspaceInBytesOnHost)); +// // printf("after gemm, rank:%d\n", rank); +// CUDA_CHECK(cudaStreamSynchronize(stream)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + +// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + +// CUBLASMP_CHECK(cublasMpDestroy(handle)); +// CUDA_CHECK(cudaFree(d_work)); + +// MPI_Barrier(MPI_COMM_WORLD); +// } +void CudaConnector::pgemm_nvhpc_mixed_precision( + 
const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=gpu_dev_stream.rank; + int nranks=gpu_dev_stream.nranks; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); + const int local_device = gpu_dev_stream.local_device; + cudaStream_t stream = gpu_dev_stream.stream; + + cublasMpHandle_t handle = nullptr; + // printf("before create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + // printf("after create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + // gpu_dev_stream.cudaSync(); + // printf("before create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? 
CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + // printf("after create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + // printf("workspaceInBytesOnDevice=%zu GiB, workspaceInBytesOnHost=%zu GiB, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); + gpu_dev_stream.cudaSync(); + // printf("before malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + // printf("after malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + std::vector h_work(workspaceInBytesOnHost); + + // gpu_dev_stream.cudaSync(); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + // gpu_dev_stream.cudaSync(); + // printf("before free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + // printf("after free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + gpu_dev_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); + + MPI_Barrier(MPI_COMM_WORLD); +} + + +void CudaConnector::pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; 
+ cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=mpi_comm_global_h.myid; + int nranks=mpi_comm_global_h.nprocs; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); + const int local_device = device_stream.local_device; + cudaStream_t stream = (cudaStream_t)device_stream.stream; + + cublasMpHandle_t handle = nullptr; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + device_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + device_stream.cudaSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + + device_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +void CudaConnector::pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type, + const char& order +) +{ + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + + int nprow, npcol; + 
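+    // cublasMpGeadd below performs a distributed matrix addition in the style of
+    // ScaLAPACK p?geadd, updating B from alpha*op(A) and beta*B on the block-cyclic
+    // distribution. When `order` is 'c'/'C', the process-grid extents, global sizes and
+    // local leading dimensions are taken from the transposed view of the Array_Desc and
+    // a column-major grid layout is used, so column-major callers need no repacking.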
assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + ORDER_CHECK(order); + if(order == 'c'||order == 'C'){ + nprow = array_descA.npcols(); + npcol = array_descA.nprows(); + }else{ + nprow = array_descA.nprows(); + npcol = array_descA.npcols(); + } + int llda,loc_n_a,lldb,loc_n_b,mA,nA,mB,nB; + if(order == 'c'||order == 'C'){ + llda = array_descA.n_loc(); + loc_n_a = array_descA.m_loc(); + lldb = array_descB.n_loc(); + loc_n_b = array_descB.m_loc(); + mA = array_descA.n(); + nA = array_descA.m(); + mB = array_descB.n(); + nB = array_descB.m(); + + }else{ + llda = array_descA.m_loc(); + loc_n_a = array_descA.n_loc(); + lldb = array_descB.m_loc(); + loc_n_b = array_descB.n_loc(); + mA = array_descA.m(); + nA = array_descA.n(); + mB = array_descB.m(); + nB = array_descB.n(); + } + if(trans == CUBLAS_OP_N){ + assert(mA == nA); + assert(mB == nB); + }else{ + assert(mA == nB); + assert(mB == nA); + } + const int global_m_a = ia-1+mA; + const int global_n_a = ja-1+nA; + const int global_m_b = ib-1+mB; + const int global_n_b = jb-1+nB; + int rank = gpu_dev_stream.rank; + int nranks = gpu_dev_stream.nranks; + + const int local_device = gpu_dev_stream.local_device; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + order=='c'||order=='C'?CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGeadd_bufferSize( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + gpu_dev_stream.calSync(); + std::vector h_work(workspaceInBytesOnHost); + CUBLASMP_CHECK(cublasMpGeadd( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} + +void CudaConnector::pgemr2d_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const int& m, const int& n, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type +) +{ + + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + int nprow, npcol; + assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + nprow = 
array_descA.nprows(); + npcol = array_descA.npcols(); + + int llda = array_descA.m_loc(); + int loc_n_a = array_descA.n_loc(); + + int lldb = array_descB.m_loc(); + int loc_n_b = array_descB.n_loc(); + + const int global_m_a = ia-1+m; + const int global_n_a = ja-1+n; + + const int global_m_b = ib-1+m; + const int global_n_b = jb-1+n; + + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid) + ); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGemr2D_bufferSize( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + CUBLASMP_CHECK(cublasMpGemr2D( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +#endif + diff --git a/src/cuda_connector.cu b/src/cuda_connector.cu new file mode 100644 index 00000000..f6ffa7db --- /dev/null +++ b/src/cuda_connector.cu @@ -0,0 +1,601 @@ +#include "cuda_connector.h" +#include "device_stream.h" +// add by hbchen in 2025-05-19 +__device__ int get_indxg2p(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return (isrcproc + indxglob / nb) % nprocs; +} +__device__ int get_indxg2l(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__device__ int get_index_g2l_r(const int& gindx, const int& m, const int& mb,const int &myprow, const int& irsrc, const int& nprows) +{ + return myprow != get_indxg2p(gindx, mb, myprow, irsrc, nprows) || + gindx >= m + ? -1 + : get_indxg2l(gindx, mb, myprow, irsrc, nprows); +} +__device__ int get_index_g2l_c(const int& gindx, const int &n, const int &nb, const int &mypcol, const int &icsrc, const int &npcols) +{ + return mypcol != get_indxg2p(gindx, nb, mypcol, icsrc, npcols) || + gindx >= n + ? 
-1 + : get_indxg2l(gindx, nb, mypcol, icsrc, npcols); +} +__global__ void det_multiply_seq(const cuDoubleComplex *inC,const int* d_ipiv,const int& num, cuDoubleComplex* ouC){ + int f = blockIdx.x*blockDim.x + threadIdx.x; + // printf("f = %d\n", f); + ouC[f]=make_cuDoubleComplex(1.0, 0.0); + for(int i=f;i>>(d_a,d_ipiv,*d_n,d_ouC); + cudaMemcpyAsync(h_ouC, d_ouC, blockSize*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_ouC, stream); + cudaFreeAsync(d_n, stream); + for(int i=0;i>>(d_a,*d_n); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_n, stream); +} +void CudaConnector::multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream) +{ + int blockSize = 256; + cuDoubleComplex * d_num; + int *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int), stream)); + int len = mat.nr()*mat.nc(); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuDoubleComplex), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream)); + int gridSize = (mat.nr()*mat.nc() + blockSize - 1) / blockSize; + multiply_number_for_ComplexMatrixDevice_kernel<<>>(mat.ptr(), *d_num, *d_len); +} +__global__ void diag_add_ComplexMatrixDevice_kernel(cuDoubleComplex* d_a, const double& d_num, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < d_m) + { + int col = row; // Diagonal element + int local_row = get_index_g2l_r(row, d_m, d_mb, d_myprow, d_irsrc, d_nprows); + int local_col = get_index_g2l_c(col, d_n, d_nb, d_mypcol, d_icsrc, d_npcols); + if (local_row != -1 && local_col != -1 && local_row < d_m_loc && local_col < d_n_loc) + { + int local_index = local_row + local_col * d_m_loc; // Column-major order + d_a[local_index].x += d_num; // Add the real number to the real part + } + } +} + +void CudaConnector::diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream) +{ + double *d_num; + int blockSize = 256; + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(double), cudaMemcpyHostToDevice, stream)); + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), 
cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int gridSize = (arrdesc.m() + blockSize - 1) / blockSize; + diag_add_ComplexMatrixDevice_kernel<<>>(mat.ptr(),*d_num,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_num, stream)); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + +} +__global__ void diag_add_matrix_device_blacs_kernel(const cuDoubleComplex* num, cuDoubleComplex* d_A, const Array_Desc_Device& array_desc_device) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if(i>=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + if(jlo==-1) + return; + if(ilo>>((cuDoubleComplex*)d_num,(cuDoubleComplex*)d_A,*d_array_desc_device); + + }else if(compute_type == LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuFloatComplex), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(cuFloatComplex), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else if(compute_type == LIBRPA_COMPUTE_TYPE_DOUBLE){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(double), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(float), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + } + CUDA_CHECK(cudaFreeAsync(d_num, (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, (cudaStream_t)device_stream.stream)); + +} +__global__ void det_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in,cuDoubleComplex* d_out,const int& m,const int& n,const int& m_loc,const int& 
n_loc,const int& mb,const int& nb,const int& myprow,const int& mypcol,const int& irsrc,const int& icsrc,const int& nprows,const int& npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // printf("idx=%d\n",idx); + d_out[idx] = make_cuDoubleComplex(1.0,0.0); + + for(int i=idx;i CudaConnector::det_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice&d_A, const LIBRPA::Array_Desc &arrdesc_pi) +{ + std::complex det_loc={1.0,0.0}; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc_pi.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc_pi.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc_pi.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc_pi.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc_pi.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc_pi.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc_pi.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc_pi.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc_pi.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc_pi.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc_pi.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc_pi.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int blockSize = 256; + int gridSize = 1; + cuDoubleComplex* d_detA; + std::complex *h_detA = new std::complex[blockSize]; + CUDA_CHECK(cudaMallocAsync((void**)&d_detA, sizeof(cuDoubleComplex) * blockSize, stream)); + // printf("before det_ComplexMatrixDevice_blacs_kernel\n"); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + det_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_detA,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + 
CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + // printf("after det_ComplexMatrixDevice_blacs_kernel\n"); + CUDA_CHECK(cudaMemcpyAsync(h_detA, d_detA, sizeof(cuDoubleComplex) * blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_detA, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i>>(d_in, d_out, *d_m_ptr, *d_n_ptr, *d_conjugate_ptr); + // printf("after transpose kernel\n"); + CUDA_CHECK(cudaFreeAsync(d_m_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_n_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_conjugate_ptr, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + in.set_ptr(d_out); + in.set_nr(n); + in.set_nc(m); +} +__global__ void cuFloatComplex_to_cuDoubleComplex_kernel(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_out[idx] = make_cuDoubleComplex(static_cast(d_in[idx].x), static_cast(d_in[idx].y)); + } +} +void CudaConnector::cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& len, const cudaStream_t& stream) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + // printf("gridSize = %d, blockSize = %d, len = %d\n", gridSize, blockSize, len); + // Launch the kernel + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, stream)); + cuFloatComplex_to_cuDoubleComplex_kernel<<>>(d_in, d_out, *d_len); +} +__global__ void cuDoubleComplex_to_cuFloatComplex_kernel(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& d_len) +{ + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < d_len) + { + d_b[idx] = make_cuFloatComplex(static_cast(d_a[idx].x), static_cast(d_a[idx].y)); + } +} +void CudaConnector::cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + // printf("gridSize = %d, blockSize = %d, len = %d\n", gridSize, blockSize, len); + // Launch the kernel + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, stream)); + // printf("start kernel function\n"); + cuDoubleComplex_to_cuFloatComplex_kernel<<>>(d_a, d_b, *d_len); +} +__global__ void float_to_double_kernel(const float* d_a, double* d_b, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_b[idx] = d_a[idx]; + } +} +void CudaConnector::float_to_double_device(const float* d_a, double* d_b, const int64_t& len) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream)); + float_to_double_kernel<<>>(d_a, d_b, *d_len); + CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void double_to_float_kernel(const double* d_a, float* d_b, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_b[idx] = static_cast(d_a[idx]); + } +} +void 
CudaConnector::double_to_float_device(const double* d_a, float* d_b, const int64_t& len) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream)); + double_to_float_kernel<<>>(d_a, d_b, *d_len); + CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void trace_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in, cuDoubleComplex* d_out, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + d_out[idx] = make_cuDoubleComplex(0.0, 0.0); + for(int i=idx;i *trace_z = (std::complex*)trace; + CUDA_CHECK(cudaMallocAsync((void**)&d_trace, blockSize*sizeof(std::complex), device_stream.stream)); + h_trace = (std::complex*)malloc(blockSize*sizeof(std::complex)); + std::complex *h_trace_z = (std::complex*)h_trace; + *(trace_z) = std::complex(0.0, 0.0); + trace_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_trace, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_trace, d_trace, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i *det_z = (std::complex*)det; + CUDA_CHECK(cudaMallocAsync((void**)&d_det, blockSize*sizeof(std::complex), device_stream.stream)); + h_det = (std::complex*)malloc(blockSize*sizeof(std::complex)); + std::complex *h_det_z = (std::complex*)h_det; + *(det_z) = std::complex(1.0, 0.0); + det_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_det, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_det, d_det, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i CudaConnector::trace_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice& d_A, const LIBRPA::Array_Desc &arrdesc) +{ + + int blockSize = 256; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), 
cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + cuDoubleComplex *d_out; + CUDA_CHECK(cudaMallocAsync((void**)&d_out, sizeof(cuDoubleComplex)*blockSize, stream)); + std::complex *h_out; + std::complex trace_loc = {0.0, 0.0}; + h_out = new std::complex[blockSize]; + int gridSize = 1; + trace_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_out,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_out, d_out, sizeof(cuDoubleComplex)*blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_out, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + if(jlo==-1) + return; + d_A[ilo+jlo*array_desc_device.m_loc()] = make_cuDoubleComplex(1.0, 0.0); +} +void ComplexMatrixDevice::set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device& array_desc_device){ + this->set_as_zero(gpu_dev_stream.stream); + Array_Desc_Device* d_array_desc_device; + // printf("array_desc_device.m():%d\n", array_desc_device.m()); + CUDA_CHECK(cudaMallocAsync((void**)&d_array_desc_device, sizeof(Array_Desc_Device), gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_array_desc_device, &array_desc_device, sizeof(Array_Desc_Device), cudaMemcpyHostToDevice, gpu_dev_stream.stream)); + if(array_desc_device.m()!=array_desc_device.n()){ + fprintf(stderr,"Error: ComplexMatrixDevice::set_as_identity: m and n are not equal\n"); + std::abort(); + } + int blockSize = 256; + int gridSize = (array_desc_device.m()+blockSize-1)/blockSize; + set_as_identity_kernel<<>>(d_data, *d_array_desc_device); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, gpu_dev_stream.stream)); +} \ No newline at end of file diff --git a/src/cuda_connector.h b/src/cuda_connector.h new file mode 100644 index 00000000..f496bf08 --- /dev/null +++ b/src/cuda_connector.h @@ -0,0 +1,621 @@ +#ifndef CUDA_CONNECTOR_H +#define CUDA_CONNECTOR_H + +#pragma once +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include 
+//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include "base_blacs.h" +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#include "scalapack_connector.h" +#include +#include +#include "gpu_device_stream.h" +#endif +using LIBRPA::Array_Desc; +#include "array_desc_device.h" +#include "matrix_device.h" +#ifdef ENABLE_NVHPC + + +class ComplexMatrixDevice{ +private: + int m=0; + int n=0; +public: + cuDoubleComplex* d_data=nullptr; + + cusolverMpGrid_t grid_cusolver=nullptr; + cusolverMpMatrixDescriptor_t desc_cusolver=nullptr; + + cublasMpGrid_t grid_cublas=nullptr; + cublasMpMatrixDescriptor_t desc_cublas=nullptr; + + bool is_cusolver_init=false; + bool is_cublas_init=false; + + ComplexMatrixDevice(){ + + } + ComplexMatrixDevice(const int& m, const int& n){ + ComplexMatrixDevice(); + this->m=m; + this->n=n; + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex))); + } + ComplexMatrixDevice(const int& m, const int& n,void* c_data){ + this->m=m; + this->n=n; + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex))); + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); + } + void set_data(const int& m, const int& n, const cudaStream_t& stream=nullptr){ + if(m!=this->m || n!=this->n){ + clean(stream); + this->m=m; + this->n=n; + if(stream==nullptr){ + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex))); + }else{ + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(cuDoubleComplex),stream)); + } + } + } + void set_data(const int& m, const int& n,const void* c_data, const cudaStream_t& stream=nullptr){ + set_data(m,n,stream); + if(stream==nullptr){ + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); + }else{ + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice,stream)); + } + } + void set_data_device(const int& m, const int& n,const void* d_A, const GpuDeviceStream& gpu_dev_stream) + { + if(m!=this->m || n!=this->n){ + set_data(m,n,gpu_dev_stream.stream); + } + + CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data,gpu_dev_stream.local_device, d_A, gpu_dev_stream.local_device, m * n * sizeof(cuDoubleComplex), gpu_dev_stream.stream)); + + } + void cusolver_init(cusolverMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){ + if(!is_cusolver_init){ + CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid(handle,&grid_cusolver,cal_comm,arrdesc.nprows(),arrdesc.npcols(),CUSOLVERMP_GRID_MAPPING_ROW_MAJOR)); + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc(&desc_cusolver,grid_cusolver,CUDA_C_64F,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc())); + is_cusolver_init=true; + } + } + void cublas_init(cublasMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){ + if(!is_cublas_init){ + CUBLASMP_CHECK(cublasMpGridCreate(handle,arrdesc.nprows(),arrdesc.npcols(),CUBLASMP_GRID_LAYOUT_ROW_MAJOR,cal_comm,&grid_cublas)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorCreate(handle,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc(),CUDA_C_64F,grid_cublas,&desc_cublas)); + is_cublas_init=true; + } + } + cuDoubleComplex* ptr(){ + return d_data; + } + const cuDoubleComplex* ptr() const { + return d_data; + } + int nr(){ + return this->m; + } + int nc(){ + return this->n; + } + void set_nr(int m){ + this->m=m; + } + void set_nc(int n){ + this->n=n; + } + void set_ptr(cuDoubleComplex* d_data){ + if(this->d_data!=nullptr){ + 
cudaFree(this->d_data); + this->d_data=nullptr; + } + this->d_data=d_data; + } + void clean(const cudaStream_t& stream=nullptr){ + if(d_data!=nullptr){ + if(stream==nullptr) + cudaFree(d_data); + else + cudaFreeAsync(d_data,stream); + d_data=nullptr; + } + this->m=0; + this->n=0; + } + void set_as_zero(const cudaStream_t& stream=nullptr); + void set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device&); + void cublasClean(cublasMpHandle_t handle){ + if(desc_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,desc_cublas)); + desc_cublas=nullptr; + } + if(grid_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid_cublas)); + grid_cublas=nullptr; + } + is_cublas_init=false; + } + ~ComplexMatrixDevice(){ + if(is_cublas_init){ + fprintf(stderr, "Error: ComplexMatrixDevice not cleaned cublasMp resources before destructing!\n"); + std::abort(); + } + // CUDA_CHECK(cudaDeviceSynchronize()); + if(d_data!=nullptr){ + CUDA_CHECK(cudaFree(d_data)); + d_data=nullptr; + } + if(desc_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyMatrixDesc(desc_cusolver)); + desc_cusolver=nullptr; + } + if(grid_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyGrid(grid_cusolver)); + grid_cusolver=nullptr; + } + } + +}; + + +#endif + +class CudaConnector +{ +public: + static + void write_file(double* A,int M,int N,const char* name){ + std::fstream out; + out.open(name, std::ios::out); + if (!out.is_open()) { + std::cerr << "Failed to open file: " << name << std::endl; + return; + } + for(int i=0;i *a, cuDoubleComplex *b, const int n, const int lda,bool is_transpose=false) + { + if (is_transpose) + { + #pragma omp parallel for + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + b[i*lda+j].x = a[j*n+i].real(); + b[i*lda+j].y = a[j*n+i].imag(); + } + } + } + else + { + #pragma omp parallel for + for (int i = 0; i < n*lda; i++) + { + b[i].x = a[i].real(); + b[i].y = a[i].imag(); + } + } + } + static inline + void cuDoubleComplexToDoubleComplex_host(std::complex *a,const cuDoubleComplex *b,const int n,const int lda,bool is_transpose=false) + { + if (is_transpose) + { + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + a[j*n+i].real(b[i*lda+j].x); + a[j*n+i].imag(b[i*lda+j].y); + } + } + } + else + { + for (int i = 0; i < n*lda; i++) + { + a[i].real(b[i].x); + a[i].imag(b[i].y); + } + } + } + // update by chenhaobo in 2025-05-19 + static inline + void cuZgetrf_f(const int& m, const int& n, cuDoubleComplex *d_a, const int& lda, int *d_ipiv, int *d_info, cusolverDnHandle_t cusolverH) + { + cudaStream_t stream; + cusolverStatus_t status = cusolverDnGetStream(cusolverH, &stream); + // printf("cusolverDnGetStream status:%d\n",status); + // printf("Stream stream:%d\n",stream); + // printf("cusolverDnhandle:%d\n",cusolverH); + int lwork; + // auto start = std::chrono::high_resolution_clock::now(); + cusolverDnZgetrf_bufferSize(cusolverH, m, n, d_a, lda, &lwork); + cuDoubleComplex *d_work; + cudaMallocAsync((void**)&d_work, lwork * sizeof(cuDoubleComplex), stream); + // cudaStreamSynchronize(stream); + cusolverDnZgetrf(cusolverH, m, n, d_a, lda, d_work, d_ipiv, d_info); + // cudaStreamSynchronize(stream); + // auto end = std::chrono::high_resolution_clock::now(); + // auto duration= std::chrono::duration_cast(end - start); + // int info; + // cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost); + // printf("stream:%d, info:%d,Time taken of the cudaStreamSynchronize on GPU: %lld microseconds\n",stream,info, 
duration.count()); + cudaFreeAsync(d_work, stream); + } + static + cuDoubleComplex det_cuDoubleComplex(const cuDoubleComplex *d_a,const int *d_ipiv, const int& n, const cudaStream_t stream); + static + void cuDoubleComplex_minus_identity_Async(cuDoubleComplex *d_a,const int& h_n, const cudaStream_t stream); + static inline void cuDoubleComplex_minus_identity_host(cuDoubleComplex* h_a, const int& n) + { + #pragma omp parallel for + for(int i=0;i *, const int &, + // const int &, const LIBRPA::Array_Desc &, int *, int &,const char order='C'); + static + void pzgetrf_nvhpc(const GpuDeviceStream&, ComplexMatrixDevice &, const int &, + const int &, const LIBRPA::Array_Desc &, int64_t *, int *,const char& order='C'); + static + void pgetrf_nvhpc_mixed_precision( + const GpuDeviceStream&, void *, + const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrf_nvhpc_mixed_precision( + void *, const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + const void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &, + int* d_info,const cudaDataType_t& compute_type, + const char& order='C' + ); + static void pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &, + const cudaDataType_t& compute_type, const char& order='C' + ); + static + std::complex det_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + std::complex trace_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + void trace_matrix_device_blacs(void* trace, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + static + void det_matrix_device_blacs(void* det, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + + // static + // void pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, + // const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, + // const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, + // const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C); + + static + void pgemm_device(cublasMpHandle_t,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t, + const ComplexMatrixDevice &,int64_t,int64_t, + const void *, + ComplexMatrixDevice &,int64_t,int64_t, + cublasComputeType_t); + + static + void pgemm_nvhpc(const GpuDeviceStream&,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + const ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + const void *, + ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + cublasComputeType_t); + // static + // void pgemm_nvhpc_cuFloatComplex(const 
GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + // const void *alpha, + // const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + // const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + // const void *beta, + // cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + // cublasComputeType_t cublas_compute_type); + static + void pgemm_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t&, + const char& order = 'c' + ); + static + void pgemr2d_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const int&,const int&, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& + ); + static + void cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_a, cuDoubleComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void float_to_double_device(const float* d_a, double* d_b, const int64_t& len); + static + void double_to_float_device(const double* d_a, float* d_b, const int64_t& len); + static + void multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream); + static + void diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream); + static + void diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc,const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static + void transpose_ComplexMatrixDevice(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& in ,bool is_conjugate=false); + + #endif + +}; + +#endif // CUDA_CONNECTOR_H \ No newline at end of file diff --git a/src/device_connector.cpp b/src/device_connector.cpp new file mode 100644 index 00000000..b343450c --- /dev/null +++ b/src/device_connector.cpp @@ -0,0 +1,248 @@ +#include "device_connector.h" +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif +#include "device_stream.h" + +void DeviceConnector::pgetrf_device_mixed_precision( + void* d_A, const 
int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order +){ + { + #ifdef ENABLE_NVHPC + cudaDataType_t cuda_compute_type; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuda_compute_type = CUDA_C_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuda_compute_type = CUDA_C_32F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cuda_compute_type = CUDA_R_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cuda_compute_type = CUDA_R_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgetrf_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + d_A, m, n, + arrdesc_pi, + d_ipiv, d_info, + cuda_compute_type, + order + ); + #else + fprintf(stderr, "Error: NVHPC is not enabled in this build, cannot call pgetrf_device_mixed_precision\n"); + exit(1); + #endif + } +} + +void DeviceConnector::pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + cublasComputeType_t cublas_compute_type; + cublasOperation_t transA, transB; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transa=='N'||transa=='n'){ + transA = CUBLAS_OP_N; + }else if(transa=='T'||transa=='t'){ + transA = CUBLAS_OP_T; + }else if(transa=='C'||transa=='c'){ + transA = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error: Unsupported transa in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transb=='N'||transb=='n'){ + transB = CUBLAS_OP_N; + }else if(transb=='T'||transb=='t'){ + transB = CUBLAS_OP_T; + }else if(transb=='C'||transb=='c'){ + transB = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error: Unsupported transb in pgemm_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgemm_nvhpc_mixed_precision( + transA, transB, + m, n, k, + alpha, + d_A, ia, ja, array_descA, + d_B, ib, jb, array_descB, + beta, + d_C, ic, jc, array_descC, + cublas_compute_type + ); + #endif +} + +void DeviceConnector::transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate +){ + #ifdef ENABLE_NVHPC + cublasHandle_t cublasH = NULL; + CUBLAS_CHECK(cublasCreate(&cublasH)); + CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream)); + cublasOperation_t trans = CUBLAS_OP_T; + if(is_conjugate) + trans = CUBLAS_OP_C; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); + CUBLAS_CHECK(cublasZgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuDoubleComplex*)d_A, 
n, + &beta, + (cuDoubleComplex*)d_B, m, + (cuDoubleComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); + cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); + CUBLAS_CHECK(cublasCgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuFloatComplex*)d_A, n, + &beta, + (cuFloatComplex*)d_B, m, + (cuFloatComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + double alpha = 1.0; + double beta = 0.0; + CUBLAS_CHECK(cublasDgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (double*)d_A, n, + &beta, + (double*)d_B, m, + (double*)d_B, m)); + + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + float alpha = 1.0; + float beta = 0.0; + CUBLAS_CHECK(cublasSgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (float*)d_A, n, + &beta, + (float*)d_B, m, + (float*)d_B, m)); + }else{ + fprintf(stderr, "Error: Unsupported compute type in transpose_device_blas\n"); + exit(1); + } + CUBLAS_CHECK(cublasDestroy(cublasH)); + #endif +} + +void DeviceConnector::num_multiply_matrix_device( + const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + cublasHandle_t cublasH = NULL; + CUBLAS_CHECK(cublasCreate(&cublasH)); + CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream)); + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + CUBLAS_CHECK(cublasZscal( + cublasH, n, + (cuDoubleComplex*)num, + (cuDoubleComplex*)d_A, 1 + )); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + CUBLAS_CHECK(cublasCscal( + cublasH, n, + (cuFloatComplex*)num, + (cuFloatComplex*)d_A, 1 + )); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + CUBLAS_CHECK(cublasDscal( + cublasH, n, + (double*)num, + (double*)d_A, 1 + )); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + CUBLAS_CHECK(cublasSscal( + cublasH, n, + (float*)num, + (float*)d_A, 1 + )); + }else{ + fprintf(stderr, "Error: Unsupported compute type in num_multiply_matrix_device\n"); + exit(1); + } + #endif +} + +void DeviceConnector::diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::diag_add_matrix_device_blacs( + num, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::trace_matrix_device_blacs( + void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::trace_matrix_device_blacs( + trace, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::det_matrix_device_blacs( + det, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::float_to_double_device(float* d_A, double* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::float_to_double_device(d_A, d_B, n); + #endif +} + +void DeviceConnector::double_to_float_device(double* d_A, float* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::double_to_float_device(d_A, d_B, n); + #endif +} diff --git a/src/device_connector.h b/src/device_connector.h new file mode 100644 index 00000000..b4188f68 --- /dev/null +++ b/src/device_connector.h @@ -0,0 +1,54 @@ +#ifndef DEVICE_CONNECTOR_H +#define DEVICE_CONNECTOR_H +#include "base_blacs.h" +#include 
"helpers.h" +using LIBRPA::Array_Desc; + +class DeviceConnector{ +public: + static void pgetrf_device_mixed_precision( + void* d_A, const int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order='C' + ); + static void pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate=false + ); + static void num_multiply_matrix_device( + const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + + static void trace_matrix_device_blacs( + void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void float_to_double_device(float* d_A, double* d_B, const int64_t& n); + static void double_to_float_device(double* d_A, float* d_B, const int64_t& n); + + +}; + + + +#endif // DEVICE_CONNECTOR_H \ No newline at end of file diff --git a/src/device_stream.cpp b/src/device_stream.cpp new file mode 100644 index 00000000..03b9ddfc --- /dev/null +++ b/src/device_stream.cpp @@ -0,0 +1,34 @@ +#include "device_stream.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +void DeviceStream::init(){ + local_device = DeviceStream::getLocalDevice(); + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = DeviceStream::allgather; + params.req_test = DeviceStream::request_test; + params.req_free = DeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = mpi_comm_global_h.myid; + params.nranks = mpi_comm_global_h.nprocs; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate((cudaStream_t*)&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolverMp_handle, local_device, (cudaStream_t)stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublasMp_handle, (cudaStream_t)stream)); + #endif +} +void DeviceStream::check_memory(){ + #ifdef ENABLE_NVHPC + size_t free, total; + CUDA_CHECK(cudaMemGetInfo(&free, &total)); // 直接查询驱动 + printf("rank:%d, Used: %f GiB / %f GiB\n",mpi_comm_global_h.myid, (total - free) / (1024.0*1024.0*1024.0), total / (1024.0*1024.0*1024.0)); + #endif +} + + +DeviceStream device_stream = DeviceStream(); \ No newline at end of file diff --git a/src/device_stream.h b/src/device_stream.h new file mode 100644 index 00000000..91b9800c --- /dev/null +++ b/src/device_stream.h @@ -0,0 +1,134 @@ +#ifndef DEVICE_STREAM_H +#define DEVICE_STREAM_H + +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#include +#include +#endif +#include "helpers.h" +#include +class DeviceStream{ +private: + #ifdef ENABLE_NVHPC + 
cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + #endif + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + #endif + + return localRank % deviceCount; + } +public: + int local_device; + #ifdef ENABLE_NVHPC + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + cusolverMpHandle_t cusolverMp_handle = nullptr; + cublasMpHandle_t cublasMp_handle = nullptr; + #endif + DeviceStream(){} + void check_memory(); + void init(); + void finalize(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + ~DeviceStream(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + void sync() const{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + #endif + } + #ifdef ENABLE_NVHPC + void cudaSync() const{ + CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,(cudaStream_t)stream)); + } + #endif +}; + +extern DeviceStream device_stream; +#endif // DEVICE_STREAM_H \ No newline at end of file diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index b634dc33..8338c640 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -20,6 +20,9 @@ using LIBRPA::envs::mpi_comm_global_h; using LIBRPA::envs::ofs_myid; using LIBRPA::utils::lib_printf; +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif const int DoubleHavriliakNegami::d_npar = 8; const std::function &)> @@ -498,18 +501,20 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac int n_lambda = this->n_nonsingular - 1; Array_Desc desc_wing_mu(blacs_ctxt_global_h); desc_wing_mu.init_square_blk(n_abf, 3, 0, 0); + desc_wing_mu.init(n_abf, 3, desc_nabf_nabf_opt.mb(), desc_wing_mu.nb(), 0, 0); 
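+    // Re-initializing the descriptor so that its row blocksize matches
+    // desc_nabf_nabf_opt.mb() keeps the row distribution of wing_mu aligned with that
+    // of sqrtveig_blacs, which lives on desc_nabf_nabf_opt; presumably this alignment
+    // is what the pgemm_f contraction over the n_abf dimension further down requires.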
Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); - // opt descriptor for wing - Array_Desc desc_wing_opt(blacs_ctxt_global_h); - desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + int mb = std::min(128, desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); + + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); for (int iomega = 0; iomega != this->omega.size(); iomega++) { auto &wing_tmp = this->wing.at(iomega); - wing_tmp = init_local_mat>(desc_wing_opt, MAJOR::COL); + wing_tmp = init_local_mat>(desc_wing, MAJOR::COL); // TODO: reconstruct wing_mu auto wing_mu_tmp = init_local_mat>(desc_wing_mu, MAJOR::COL); for (int alpha = 0; alpha != 3; alpha++) @@ -529,7 +534,7 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac ScalapackConnector::pgemm_f('C', 'N', n_lambda, 3, n_abf, 1.0, sqrtveig_blacs.ptr(), 1, 2, desc_nabf_nabf_opt.desc, wing_mu_tmp.ptr(), 1, 1, desc_wing_mu.desc, 0.0, wing_tmp.ptr(), 1, 1, - desc_wing_opt.desc); + desc_wing.desc); } this->wing_mu.clear(); @@ -1058,6 +1063,8 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); this->body_inv = init_local_mat>(desc_body, MAJOR::COL); /* for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) @@ -1077,6 +1084,46 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, return desc_body; }; +#ifdef ENABLE_NVHPC +Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) +{ + Profiler::start("get_inverse_body_of_chi0_nvhpc"); + gpu_dev_stream.calSync(); + Array_Desc desc_body(blacs_ctxt_global_h); + desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); + this->d_body_inv.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + Array_Desc_Device desc_body_dev(desc_body); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_nonsingular - 1, n_nonsingular - 1, + d_chi0_block.ptr(), 2, 2, desc_nabf_nabf_opt, + this->d_body_inv.ptr(), 1, 1, desc_body, + CUDA_C_64F + ); + ComplexMatrixDevice d_identity; + d_identity.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + d_identity.set_as_identity(gpu_dev_stream, desc_body_dev); + char order = 'c'; + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_body_inv); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_body_inv.ptr(), 1, 1, desc_body, + d_identity.ptr(), 1, 1, desc_body, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + + this->d_body_inv.set_data_device(d_identity.nr(),d_identity.nc(),d_identity.ptr(),gpu_dev_stream); + Profiler::stop("get_inverse_body_of_chi0_nvhpc"); + return desc_body; +} +#endif void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) { 
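+    // Rough picture of construct_L (and of construct_L_nvhpc below): with B the body
+    // block of the dielectric matrix in the Coulomb-eigenvector basis and w its wing,
+    // it assembles the 3x3 head correction
+    //     Lind(ifreq) = head(ifreq) - w^H * B^{-1} * w
+    // together with bw = B^{-1} * w and wb = w^H * B^{-1}, which cal_eps /
+    // cal_eps_nvhpc then use in the angular average around Gamma.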
Profiler::start("cal_L"); @@ -1086,17 +1133,16 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); // opt descriptor for wing - Array_Desc desc_wing_opt(blacs_ctxt_global_h); - desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); - + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + Array_Desc desc_lam_3(blacs_ctxt_global_h); - desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); Array_Desc desc_3_lam(blacs_ctxt_global_h); - desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); Array_Desc desc_3_3(blacs_ctxt_global_h); - desc_3_3.init_square_blk(3, 3, 0, 0); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); @@ -1104,12 +1150,12 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); + desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, + desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, Lind_loc.ptr(), 1, 1, desc_3_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, - wing.at(ifreq).ptr(), 1, 1, desc_wing_opt.desc, body_inv.ptr(), 1, + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); for (int i = 0; i != 3; i++) @@ -1146,6 +1192,114 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) Profiler::stop("cal_L"); }; +#ifdef ENABLE_NVHPC +void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body) +{ + Profiler::start("cal_L"); + this->Lind.resize(3, 3, MAJOR::COL); + this->bw.resize(n_nonsingular - 1, 3, MAJOR::COL); + this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); + Array_Desc desc_wing(blacs_ctxt_global_h); + desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); + // opt descriptor for wing + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + + Array_Desc desc_lam_3(blacs_ctxt_global_h); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + + Array_Desc desc_3_lam(blacs_ctxt_global_h); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); + + Array_Desc desc_3_3(blacs_ctxt_global_h); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); + + auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); + auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); + auto Lind_loc = init_local_mat>(desc_3_3, MAJOR::COL); + // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); + + ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc,d_wing_ifreq; + + d_wing_ifreq.set_data(wing.at(ifreq).nr(),wing.at(ifreq).nc(),wing.at(ifreq).ptr(),gpu_dev_stream.stream); + + 
d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); + d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); + d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); + + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, + &calpha, + d_body_inv, 1, 1, desc_body, + d_wing_ifreq, 1, 1, desc_wing, + &cbeta, + d_lam_3, 1, 1, desc_lam_3, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); + // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); + // printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, 3, n_nonsingular - 1, + &calpha, + d_wing_ifreq, 1, 1, desc_wing, + d_lam_3, 1, 1, desc_lam_3, + &cbeta, + d_Lind_loc, 1, 1, desc_3_3, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, + &calpha, + d_wing_ifreq, 1, 1, desc_wing, + d_body_inv, 1, 1, desc_body, + &cbeta, + d_3_lam, 1, 1, desc_3_lam, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + + CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + for (int i = 0; i != 3; i++) + { + auto loc_i = desc_3_3.indx_g2l_r(i); + for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) + { + auto loc_ilambda = desc_lam_3.indx_g2l_r(ilambda); + auto loc_ibw = desc_lam_3.indx_g2l_c(i); + if (loc_ibw >= 0 && loc_ilambda >= 0) + this->bw(ilambda, i) = lam_3(loc_ilambda, loc_ibw); + + loc_ilambda = desc_3_lam.indx_g2l_c(ilambda); + auto loc_iwb = desc_3_lam.indx_g2l_r(i); + if (loc_iwb >= 0 && loc_ilambda >= 0) + this->wb(i, ilambda) = _3_lam(loc_iwb, loc_ilambda); + + MPI_Allreduce(MPI_IN_PLACE, &bw(ilambda, i), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + MPI_Allreduce(MPI_IN_PLACE, &wb(i, ilambda), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + + for (int j = 0; j != 3; j++) + { + auto loc_j = desc_3_3.indx_g2l_c(j); + if (loc_j >= 0 && loc_i >= 0) + this->Lind(i, j) = head.at(ifreq)(i, j) - Lind_loc(loc_i, loc_j); + MPI_Allreduce(MPI_IN_PLACE, &Lind(i, j), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + } + + Profiler::stop("cal_L"); +}; +#endif + void diele_func::get_Leb_points() { auto quad_order = lebedev::QuadratureOrder::order_5810; @@ -1306,25 +1460,7 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - // auto identity = init_local_mat>(desc_body, MAJOR::COL); - // for (int i = 0; i < n_nonsingular - 1; i++) - // { - // const int ilo = desc_body.indx_g2l_r(i); - // if (ilo < 0) continue; - // for (int j 
= 0; j < n_nonsingular - 1; j++) - // { - // const int jlo = desc_body.indx_g2l_c(j); - // if (jlo < 0) continue; - // if (i == j) - // identity(ilo, jlo) = 1.0; - // else - // identity(ilo, jlo) = 0.0; - // } - // } - // ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - - // 1, - // 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, - // desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgeadd_f('N', n_nonsingular - 1, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, 1, desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); Profiler::stop("cal_inverse_dielectric_matrix_ij"); @@ -1334,6 +1470,119 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ Profiler::stop("cal_inverse_dielectric_matrix"); }; +#ifdef ENABLE_NVHPC +void diele_func::cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body) +{ + Profiler::start("cal_inverse_dielectric_matrix"); + // mpi_comm_global_h.barrier(); + this->chi0 = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + + const double k_volume = std::abs(G.Det()); + this->vol_gamma = k_volume / nk; + double vol_gamma_numeric = 0.0; + if (ifreq == 0 && mpi_comm_global_h.is_root()) + { + for (int ileb = 0; ileb != qw_leb.size(); ileb++) + { + vol_gamma_numeric += qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / 3.0; + } + std::cout << "Number of angular grids for average inverse dielectric matrix: " + << qw_leb.size() << std::endl; + std::cout << "vol_gamma_numeric/vol_gamma: " << vol_gamma_numeric << ", " << vol_gamma + << std::endl; + std::cout << "Angular quadrature accuracy for volume: " << vol_gamma_numeric / vol_gamma + << " (should be close to 1)" << std::endl; + } + /*std::cout << "major of Matz: " << wing[0].is_row_major() << "," << body_inv.is_row_major() + << "," << transpose(wing.at(0), true).is_row_major() << "," << Lind.is_row_major() + << std::endl;*/ + construct_L_nvhpc(gpu_dev_stream, ifreq, desc_body); + + Profiler::start("precompute_q_data"); + + const size_t nleb = qw_leb.size(); + std::vector> weights(nleb); + std::vector> q_vectors(nleb); + + const auto L00 = Lind(0, 0), L01 = Lind(0, 1), L02 = Lind(0, 2); + const auto L10 = Lind(1, 0), L11 = Lind(1, 1), L12 = Lind(1, 2); + const auto L20 = Lind(2, 0), L21 = Lind(2, 1), L22 = Lind(2, 2); + +#pragma omp parallel for schedule(static) + for (int ileb = 0; ileb < nleb; ++ileb) + { + const double qx = qx_leb[ileb]; + const double qy = qy_leb[ileb]; + const double qz = qz_leb[ileb]; + + q_vectors[ileb] = {qx, qy, qz}; + + const auto qLq = qx * (qx * L00 + qy * L01 + qz * L02) + + qy * (qx * L10 + qy * L11 + qz * L12) + + qz * (qx * L20 + qy * L21 + qz * L22); + + weights[ileb] = qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / (3.0 * vol_gamma) / qLq; + } + Profiler::stop("precompute_q_data"); + + Profiler::start("cal_inverse_dielectric_matrix_ij"); + int i_start = 0, i_end = n_nonsingular; + int j_start = 0, j_end = n_nonsingular; +#pragma omp parallel for schedule(dynamic, 4) collapse(2) + for (int i = i_start; i != i_end; i++) + { + for (int j = j_start; j != j_end; j++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + if (jlo < 0) continue; + + complex result = 0.0; + + if (i == 0 && j == 0) + { + for (int ileb = 0; ileb < nleb; ++ileb) + { + result += weights[ileb]; + } + } + else if (i == 0 || j == 0) + { + result = 0.0; + } + 
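+                // Body block (i, j >= 1): accumulate the wing contribution
+                //     sum_leb w_leb * (bw_i . q) * (q . wb_j)
+                // of the angular average; the bare B^{-1} part is added afterwards by
+                // the pgeadd_f call that follows this loop.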
else + { + const int idx_i = i - 1, idx_j = j - 1; + + const auto bw_i0 = bw(idx_i, 0), bw_i1 = bw(idx_i, 1), bw_i2 = bw(idx_i, 2); + const auto wb_j0 = wb(0, idx_j), wb_j1 = wb(1, idx_j), wb_j2 = wb(2, idx_j); + + for (int ileb = 0; ileb < nleb; ++ileb) + { + const auto &[qx, qy, qz] = q_vectors[ileb]; + const auto bwq = bw_i0 * qx + bw_i1 * qy + bw_i2 * qz; + const auto qwb = qx * wb_j0 + qy * wb_j1 + qz * wb_j2; + + result += weights[ileb] * bwq * qwb; + } + } + chi0(ilo, jlo) = result; + } + } + ScalapackConnector::pgeadd_f( + 'N', n_nonsingular - 1, n_nonsingular - 1, + 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, + 1.0, + chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + Profiler::stop("cal_inverse_dielectric_matrix_ij"); + if (mpi_comm_global_h.is_root()) + std::cout << "* Success: calculate average inverse dielectric matrix no." << ifreq + 1 + << "." << std::endl; + Profiler::stop("cal_inverse_dielectric_matrix"); +}; +#endif /*std::complex diele_func::compute_chi0_inv_00(const int ifreq) { std::complex total = 0.0; @@ -1423,4 +1672,23 @@ void diele_func::rewrite_eps(matrix_m> &chi0_block, const i this->body_inv.clear(); }; +#ifdef ENABLE_NVHPC +void diele_func::rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const int& ifreq, Array_Desc& desc_nabf_nabf_opt) +{ + auto desc_body = get_body_inv_nvhpc(gpu_dev_stream, d_chi0_block, desc_nabf_nabf_opt); + this->body_inv = init_local_mat>(desc_body, MAJOR::COL); + CUDA_CHECK(cudaMemcpyAsync(this->body_inv.ptr(),this->d_body_inv.ptr(),sizeof(cuDoubleComplex)*desc_body.m_loc()*desc_body.n_loc(),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + + cal_eps_nvhpc(gpu_dev_stream, ifreq, desc_nabf_nabf_opt, desc_body); + d_chi0_block.set_data(this->chi0.nr(),this->chi0.nc(),this->chi0.ptr(),gpu_dev_stream.stream); + // this->chi0.clear(); + ofs_myid << get_timestamp() << "success finish rewrite_eps"<Lind.clear(); + this->body_inv.clear(); + this->d_body_inv.clean(gpu_dev_stream.stream); + ofs_myid << get_timestamp() << "success clean the data"<> body_inv; // ( i:3, j:3 ) + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_body_inv; + #endif matrix_m> Lind; // ( i:n_lambda, j:3 ) matrix_m> bw; @@ -132,16 +137,26 @@ class diele_func Array_Desc get_body_inv(matrix_m> &chi0_block, Array_Desc &desc_nabf_nabf_opt); void construct_L(const int ifreq, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body); + #endif // Lebedev-Laikov quadrature void get_Leb_points(); void get_g_enclosing_gamma(); void calculate_q_gamma(); void cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #endif // not used now due to performance optimization // std::complex compute_chi0_inv_00(const int ifreq); // std::complex compute_chi0_inv_ij(const int ifreq, int i, int j); void rewrite_eps(matrix_m> &chi0_block, const int ifreq, Array_Desc &desc_nabf_nabf_opt); + #ifdef ENABLE_NVHPC + Array_Desc get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice&, const Array_Desc&); + void rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream,ComplexMatrixDevice&,const int& ifreq,Array_Desc& desc_nabf_nabf_opt); + #endif void assign_chi0(matrix_m> &chi0_block, Array_Desc &desc_nabf_nabf_opt); }; diff --git a/src/epsilon_cuda.cpp b/src/epsilon_cuda.cpp 
new file mode 100644 index 00000000..dc194711 --- /dev/null +++ b/src/epsilon_cuda.cpp @@ -0,0 +1,1448 @@ +#include "epsilon_cuda.h" +#include "epsilon.h" +#include "device_stream.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "atoms.h" +#include "constants.h" +#include "envs_blacs.h" +#include "envs_io.h" +#include "envs_mpi.h" +#include "lapack_connector.h" +#include "libri_utils.h" +#include "matrix_m_parallel_utils.h" +#include "parallel_mpi.h" +#include "params.h" +#include "pbc.h" +#include "profiler.h" +#include "scalapack_connector.h" +#include "stl_io_helper.h" +#include "utils_blacs.h" +#include "utils_io.h" +#include "utils_mem.h" +#include "utils_mpi_io.h" +#ifdef LIBRPA_USE_LIBRI +#include +#include +using RI::Tensor; +using RI::Communicate_Tensors_Map_Judge::comm_map2_first; +#endif +#ifdef ENABLE_NVHPC +#include "helpers.h" +#include +#include +#endif + +using LIBRPA::Array_Desc; +using LIBRPA::envs::blacs_ctxt_global_h; +using LIBRPA::envs::mpi_comm_global_h; +using LIBRPA::envs::ofs_myid; +using LIBRPA::utils::lib_printf; +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + lib_printf("Begin to compute_RPA_correlation_blacs_2d_nvhpc myid: %d\n", mpi_comm_global_h.myid); + system("free -m"); + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK_nvhpc 2D\n"); + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + if (mpi_comm_global_h.myid == 0) lib_printf("n_abf = %d\n", n_abf); + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + MatrixDevice> d_chi0_block, d_coul_block, d_coul_chi0_block; + #endif + vector> qpts; + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif +#ifdef LIBRPA_USE_LIBRI + + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + } + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double block_begin = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>>> + chi0_libri; + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); + } + } + if (mpi_comm_global_h.is_root()) + { + lib_printf("Begin to clean chi0 !!! 
\n"); + LIBRPA::utils::display_free_mem(); + lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", + chi0_wq.size(), freq, q.x, q.y, q.z); + } + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + + LIBRPA::utils::release_free_mem(); + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + } + + double pi_begin = omp_get_wtime(); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr(),device_stream.stream); + d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr(),device_stream.stream); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc(), coul_chi0_block.ptr(),device_stream.stream); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + bool is_mixed_precision = false; + if(is_mixed_precision) + { + MatrixDevice> d_coul_block_f(coul_block.nr(),coul_block.nc(),device_stream.stream); + MatrixDevice> d_chi0_block_f(chi0_block.nr(),chi0_block.nc(),device_stream.stream); + MatrixDevice> d_coul_chi0_block_f(coul_chi0_block.nr(),coul_chi0_block.nc(),device_stream.stream); + DeviceConnector::double_to_float_device((double*)d_coul_block.ptr(),(float*)d_coul_block_f.ptr(),d_coul_block.nr()*d_coul_block.nc()*2); + DeviceConnector::double_to_float_device((double*)d_chi0_block.ptr(),(float*)d_chi0_block_f.ptr(),d_chi0_block.nr()*d_chi0_block.nc()*2); + DeviceConnector::double_to_float_device((double*)d_coul_chi0_block.ptr(),(float*)d_coul_chi0_block_f.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + std::complex calpha_f(1.0f,0.0f),cbeta_f(0.0f,0.0f); + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha_f, + d_coul_block_f.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + &cbeta_f, + d_coul_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT + ); + DeviceConnector::float_to_double_device((float*)d_coul_chi0_block_f.ptr(),(double*)d_coul_chi0_block.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + }else{ + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + &cbeta, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + } + double pi_end = omp_get_wtime(); + complex trace_pi(0.0, 0.0); + complex trace_pi_loc(0.0, 0.0); + DeviceConnector::trace_matrix_device_blacs(&trace_pi_loc,d_coul_chi0_block.ptr(),desc_nabf_nabf,LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + DeviceConnector::num_multiply_matrix_device(d_coul_chi0_block.nr()*d_coul_chi0_block.nc(),&calpha1,d_coul_chi0_block.ptr(),LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + DeviceConnector::diag_add_matrix_device_blacs(&calpha,d_coul_chi0_block.ptr(),desc_nabf_nabf, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + int64_t *d_ipiv; + int *d_info ; + CUDA_CHECK(cudaMallocAsync((void**)&d_ipiv, 
sizeof(int64_t)*max(desc_nabf_nabf.m_loc(), desc_nabf_nabf.n_loc()),device_stream.stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_info, sizeof(int),device_stream.stream)); + + complex ln_det = + compute_pi_det_blacs_2d_nvhpc(d_coul_chi0_block, desc_nabf_nabf, d_ipiv, d_info, 'c'); + CUDA_CHECK(cudaFreeAsync(d_ipiv,device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_info,device_stream.stream)); + + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = trace_pi + ln_det; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // <, atom_mapping::pair_t_old>> + pi_freq_q_Mu_Nu; + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); + else + pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); + lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, + pi_freq_q_Mu_Nu.size()); + // mpi_comm_global_h.barrier(); + + int range_all = N_all_mu; + + vector part_range; + part_range.resize(atom_mu.size()); + part_range[0] = 0; + int count_range = 0; + + for (int I = 0; I != atom_mu.size() - 1; I++) + { + count_range += atom_mu[I]; + part_range[I + 1] = count_range; + } + + + // pi_freq_q contains all atoms + map, ComplexMatrix>> pi_freq_q; + + for(const auto &freq : chi0.tfg.get_freq_nodes()) + { + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; + if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi = freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; + pi_freq_q[freq][q].create(range_all, range_all); + + ComplexMatrix pi_munu_tmp(range_all, range_all); + pi_munu_tmp.zero_out(); + if(!chi0.get_chi0_q().empty()) + for (const auto &Mu_Nupi : MuNupi) + { + const auto Mu = Mu_Nupi.first; + const auto Nupi = Mu_Nupi.second; + const size_t n_mu = atom_mu[Mu]; + for (const auto &Nu_pi : Nupi) + { + const auto Nu = Nu_pi.first; + const auto pimat = Nu_pi.second; + const size_t n_nu = atom_mu[Nu]; + + for (size_t mu = 0; mu != n_mu; ++mu) + { + for (size_t nu = 0; nu != n_nu; ++nu) + { + pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); + } + } + } + } + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + 
LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + { + mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); + } + else + { + pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); + } + } + } + // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + // if (mpi_comm_global_h.myid == 0) + { + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + int deviceCount; + cudaError_t err= cudaGetDeviceCount(&deviceCount); + + // if(err==cudaSuccess&&deviceCount>0&&deviceCount!=4) + // printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0); + // ==============================test the velocity of cuda stream======================================== + + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + range_all=TEST_LU_NUMBER; + complex* temp_complex=new complex[range_all*range_all]; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + double temp_begin_time=omp_get_wtime(); + // #pragma omp parallel for + for(int i=0;i[range_all*range_all]; + complex* c_ptr=pi_freq_q[freq][q].c; + + double temp_begin_time=omp_get_wtime(); + #pragma omp parallel for + for(int i=0;i trace_pi = trace(pi_freq_q.at(freq).at(q)); + complex rpa_for_omega_q(0.0, 0.0); + int info_LU = 1; + int *ipiv = new int[range_all]; + cuDoubleComplex det_test; + if(range_all det_for_rpa(det_test.x, det_test.y); + // printf("det_for_rpa_gpu: %f+%fi\n",det_test.x,det_test.y); + + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); + // printf("LU time by lapack: %lld us\n", duration.count()); + // complex trace_pi; + complex ln_det; + ln_det = std::log(det_for_rpa); + + // printf("in_det: %f+%fi, trace_pi: %f+%fi\n", ln_det.real(), ln_det.imag(), + // trace_pi.real(), trace_pi.imag()); + // cout << "PI trace vector:" << endl; + // cout << endl; + rpa_for_omega_q = ln_det + trace_pi; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n", + // tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration); + // printf("rpa_for_omega_q:%f+%fi,freq_weight:%f,irk_weight[q]:%f\n", + // rpa_for_omega_q.real(), rpa_for_omega_q.imag(), freq_weight, + // irk_weight[q]); + #ifdef OPEN_OMP_FOR_LU_DECOMPOSITION + #pragma omp critical + #endif + { + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + // printf("freq: %f, q: (%f, %f, %f), rpa_for_omega_q: %f+%fi,tot_RPA_energy_gpu: %f+%fi,num_iteration:%d,deviceId:%d,streamId:%d\n", + // freq, q.x, q.y, q.z, rpa_for_omega_q.real(), rpa_for_omega_q.imag(),tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration, deviceId, streamId); + + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n",tot_RPA_energy.real(),tot_RPA_energy.imag(),num_iteration); + } + // double test_endTime=omp_get_wtime(); + // printf("task_time:%f(aimed to test whether task is congested)\n",test_endTime-test_startTime); + num_iteration++; + #ifndef OPEN_OMP_FOR_LU_DECOMPOSITION + printf("one task time:%f\n",omp_get_wtime()-one_task_begin); + #endif + } + } + + } + printf("time for calculate: %f\n",omp_get_wtime()-time_calculate); + printf("mpi_comm_global_h.myid:%d,num_iteration:%d\n",mpi_comm_global_h.myid,num_iteration); + #ifdef 
OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("time for all tasks: %f\n",omp_get_wtime()-start_time); + #endif + printf("tot_RPA_energy_gpu: %f+%fi\n",tot_RPA_energy.real(),tot_RPA_energy.imag()); + if(range_all, complex> global_cRPA_q; + for (auto q_weight : irk_weight) + { + MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, + MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); + } + + for (auto &q_crpa : global_cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + complex gather_tot_RPA_energy(0.0, 0.0); + MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, + mpi_comm_global_h.comm); + corr.value = gather_tot_RPA_energy; + } + // printf("gather_tot_RPA_energy_gpu: %f+%fi\n",corr.value.real(),corr.value.imag()); + corr.etype = CorrEnergy::type::RPA; + return corr; +} +#ifdef ENABLE_NVHPC + +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &d_A, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order) +{ + MatrixDevice> d_A_T; + + ORDER_CHECK(order); + if(order=='C'||order=='c'){ + d_A_T.set_data(d_A.nc(), d_A.nr(), device_stream.stream); + DeviceConnector::transpose_device_blas(d_A.ptr(),d_A_T.nr(), d_A_T.nc(),d_A_T.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + } + int ia=1,ja=1; + bool is_mixed_precision=false; + if(is_mixed_precision){ + MatrixDevice> d_A_f(d_A.nr(),d_A.nc(),device_stream.stream); + + DeviceConnector::double_to_float_device((order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),(float*)d_A_f.ptr(),d_A.nr()*d_A.nc()*2); + + DeviceConnector::pgetrf_device_mixed_precision( + d_A_f.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + order + ); + DeviceConnector::float_to_double_device((float*)d_A_f.ptr(),(order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),d_A.nr()*d_A.nc()*2); + }else{ + DeviceConnector::pgetrf_device_mixed_precision( + (order=='C'||order=='c')?(void*)d_A_T.ptr():(void*)d_A.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + order); + } + if(order=='C'||order=='c'){ + DeviceConnector::transpose_device_blas(d_A_T.ptr(),d_A.nr(),d_A.nc(), d_A.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + d_A_T.clean(device_stream.stream); + } + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + std::complex det_loc; + + DeviceConnector::det_matrix_device_blacs( + &det_loc, d_A.ptr(), arrdesc_pi, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + if(det_loc.real() > 0) + { + ln_det_loc = std::log(det_loc); + } + else + { + ln_det_loc = std::log(-det_loc); + } + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + return ln_det_all; +} +// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using NVHPC" << endl; + } + mpi_comm_global_h.barrier(); + + Profiler::start("compute_Wc_freq_q_blacs_init"); + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // Use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter Maximal 
blocksize ensure that atom indices related to the rows/columns of a + // local matrix is minimized. + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + // This, however, is not optimal for matrix operations, and may lead to segment fault during + // MPI operations with parallel linear algebra subroutine. Thus we define an optimal blocksize + Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); + const int nb_opt = min(128, desc_nabf_nabf.nb()); + desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); + // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric + // matrix + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + // temp_block is used to collect data from IJ-pair data structure with comm_map2_first + auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + GpuDeviceStream gpu_dev_stream; + ComplexMatrixDevice d_temp_block; + ComplexMatrixDevice d_coul_block; + #endif + // Below are the working arrays for matrix operations + auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_chi0_block; + ComplexMatrixDevice d_coul_chi0_block; + ComplexMatrixDevice d_coul_eigen_block; + #endif + auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_coulwc_block; + #endif + + const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + + coul_chi0_block.size() + coulwc_block.size()) * + 16.0e-6; + ofs_myid << get_timestamp() + << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks + << endl; + + const auto atpair_local = dispatch_upper_trangular_tasks( + natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, + blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); +#ifdef LIBRPA_DEBUG + ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; + ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; +#endif + + // IJ pair of Wc to be returned + pair, set> Iset_Jset_Wc; + for (const auto &ap : atpair_local) + { + Iset_Jset_Wc.first.insert(ap.first); + Iset_Jset_Wc.second.insert(ap.second); + } + + // Prepare local basis indices for 2D->IJ map + int I, iI; + map> map_lor_v; + map> map_loc_v; + for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_lor_v[I].push_back(iI); + } + for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_loc_v[I].push_back(iI); + } + + vector> qpts; + for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); + + vec eigenvalues(n_abf); + Profiler::cease("compute_Wc_freq_q_blacs_init"); + LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); + + Profiler::start("compute_Wc_freq_q_work"); 
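+    // Rough outline of the per-(q, omega) work in the loop below: the V(q) and
+    // chi0(q, omega) atom-pair tensors are redistributed into 2D BLACS blocks via
+    // LibRI, the symmetrized polarizability v^{1/2} chi0 v^{1/2} is built in the
+    // Coulomb-eigenvector basis, the head/wing elements are overwritten at Gamma when
+    // option_dielect_func == 3, the dielectric matrix is inverted, and
+    //     Wc(q, omega) ~ v_trunc^{1/2} (eps^{-1} - 1) v_trunc^{1/2}
+    // is assembled with the truncated-Coulomb square root prepared once per q-point.
+    // With ENABLE_NVHPC the pgemr2d / pgemm / inversion steps go through CudaConnector
+    // on the GPU stream; otherwise the corresponding ScaLAPACK calls are used.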
+#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); + const int iq_in_k = + std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); + // q-point in fractional coordinates + const auto &qf = kfrac_list[iq_in_k]; + LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, + qpts.size(), qf.x, qf.y, qf.z); + coul_block.zero_out(); + coulwc_block.zero_out(); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + // q-array for LibRI object + std::array qa = {q.x, q.y, q.z}; + + // collect the block elements of truncated coulomb matrices first + // as we reuse coul_eigen_block to reduce memory usage + Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); + { + size_t n_singular_coulwc; + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || + coulmat_wc.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); + + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); + Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); + power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); + ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; + + Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); + // collect the block elements of coulomb matrices + { + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if 
(coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || + coulmat_eps.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; + // ofs_myid << "Couleps_libri" << endl << couleps_libri; + // if (couleps_libri.size() == 0) + // throw std::logic_error("data at q-point not found in coulmat_eps"); + + // perform communication + ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; +#ifdef LIBRPA_DEBUG + ofs_myid << set_IJ_nabf_nabf << endl; + ofs_myid << "Extended blocks" << endl; + ofs_myid << "atom 1: " << s0_s1.first << endl; + ofs_myid << "atom 2: " << s0_s1.second << endl; +#endif + // ofs_myid << "Owned blocks\n"; + // print_keys(ofs_myid, couleps_libri); + // mpi_comm_global_h.barrier(); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; + + ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_coul_block.set_data(coul_block.nr(),coul_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + gpu_dev_stream.cudaSync(); + CUDA_CHECK(cudaMemcpy(coul_block.ptr(), d_coul_block.ptr(), sizeof(cuDoubleComplex)*coul_block.nr()*coul_block.nc(), cudaMemcpyDeviceToHost)); + #endif + ofs_myid << get_timestamp() << " Done construct couleps 2D block" << endl; + } + + size_t n_singular; + ofs_myid << get_timestamp() << " Start power hemat couleps\n"; + matrix_m> sqrtveig_blacs; + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_sqrtveig_blacs; + #endif + if (is_gamma_point(q)) + { + // choice of power_hemat_blacs_real/power_hemat_blacs_desc + // leads to sub-meV difference + sqrtveig_blacs = power_hemat_blacs_real( + coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, + eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); + if (Params::replace_w_head && Params::option_dielect_func == 3) + { + df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); + } + } + else + { + sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + } + ofs_myid << get_timestamp() << " Done power hemat couleps\n"; + // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); + // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at + // imaginary frequency is not prepared + if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); + const size_t n_nonsingular = n_abf - 
n_singular; + if(gpu_dev_stream.rank==0){ + printf("n_abf:%lu,n_nonsingular:%lu,n_singular:%lu\n",n_abf,n_nonsingular,n_singular); + } + Profiler::stop("epsilon_prepare_couleps_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); + ofs_myid << get_timestamp() << " Done couleps sqrt\n"; + std::flush(ofs_myid); + + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + Profiler::start("epsilon_wc_work_q_omega"); + Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); + chi0_block.zero_out(); + { + std::map>, + RI::Tensor>>> + chi0_libri; + if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) + { + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = + RI::Tensor>({n_mu, n_nu}, pchi); + } + } + // Release the chi0 block for this frequency and q to reduce memory load, + // as they will not be used again + chi0.free_chi0_q(freq, q); + } + // ofs_myid << "chi0_libri" << endl << chi0_libri; + Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); + const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); + Profiler::start("epsilon_prepare_chi0_2d_collect_block"); + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_chi0_block.set_data(chi0_block.nr(),chi0_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + #endif + Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); + } + Profiler::stop("epsilon_prepare_chi0_2d"); + + Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); + if(gpu_dev_stream.rank==0) + printf("is_gamma_point(q):%d\n",is_gamma_point(q)); + const std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) + { + ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; + // rotate to Coulomb-eigenvector basis + // descending order + + d_sqrtveig_blacs.set_data(sqrtveig_blacs.nr(), sqrtveig_blacs.nc(), sqrtveig_blacs.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + gpu_dev_stream.cudaSync(); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_abf, + &calpha, + d_chi0_block,1,1,desc_nabf_nabf_opt, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + &cbeta, + 
d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_C,CUBLAS_OP_N,n_nonsingular, n_nonsingular, n_abf, + &calpha, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + if (Params::option_dielect_func == 3) + { + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + // { + // std::string filename = "gpu_"; + // filename += to_string(gpu_dev_stream.nranks); + // filename += "_"; + // filename += to_string(gpu_dev_stream.rank); + // gpu_dev_stream.cudaSync(); + // CUDA_CHECK(cudaMemcpy(coul_chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*coul_chi0_block.nr()*coul_chi0_block.nc(),cudaMemcpyDeviceToHost)); + // CudaConnector::write_file((cuDoubleComplex*)coul_chi0_block.ptr(),coul_chi0_block.nr(),coul_chi0_block.nc(),filename.data()); + // } + // CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + + ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" + << endl; + // df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); + df_headwing.rewrite_eps_nvhpc(gpu_dev_stream, d_chi0_block, ifreq, desc_nabf_nabf_opt); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + // rotate back to ABF + // descending order + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // subtract 1 from diagonal + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,-1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_multiply_coulwc"); + } + else + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); + if (ilo >= 0 && jlo >= 0) + { + ofs_myid << get_timestamp() << "Perform 
the head element overwrite" << endl; + std::complex temp_element = 1.0 - epsmac_LF_imagfreq[ifreq]; + CUDA_CHECK(cudaMemcpy(d_chi0_block.ptr() + ilo + jlo * d_chi0_block.nr(), + &temp_element, sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice)); + } + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + 
Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + } + else + { + Profiler::start("epsilon_compute_eps_pgemm_1"); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::cease("epsilon_compute_eps_pgemm_1"); + Profiler::start("epsilon_compute_eps_pgemm_2"); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // d_coul_block.cublasClean(gpu_dev_stream.cublas_handle); + Profiler::cease("epsilon_compute_eps_pgemm_2"); + // now chi0_block is actually v1/2 chi v1/2 + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::stop("epsilon_compute_eps"); + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + 
gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + // Array_Desc_Device array_desc_device(desc_nabf_nabf_opt); + // printf("successful create object array_desc_device\n"); + #ifndef ENABLE_NVHPC + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + CUDA_C_64F + ); + CUDA_CHECK(cudaMemcpyAsync(temp_block.ptr(), d_temp_block.ptr(), sizeof(cuDoubleComplex)*temp_block.nr()*temp_block.nc(), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + #endif + + Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); + Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); + map>>> Wc_MNmap; + // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, + // LIBRPA::atomic_basis_abf, chi0_block, + // desc_nabf_nabf, MAJOR::ROW); + map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, + temp_block, desc_nabf_nabf, MAJOR::ROW); + Profiler::stop("epsilon_convert_wc_map_block"); + + Profiler::start("epsilon_convert_wc_communicate", "Communicate"); + { + std::map>, + RI::Tensor>>> + Wc_libri; + Profiler::start("epsilon_convert_wc_communicate_1"); + for (const auto &M_NWc : Wc_MNmap) + { + const auto &M = M_NWc.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_Wc : M_NWc.second) + { + const auto &N = N_Wc.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &Wc = N_Wc.second; + // std::valarray> Wc_va(Wc.ptr(), Wc.size()); + // auto pWc = std::make_shared>>(); + // *pWc = Wc_va; + /*if (iq == 10 && ifreq == 10) + { + char fn[100]; + sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); + print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); + }*/ + Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); + } + } + Profiler::stop("epsilon_convert_wc_communicate_1"); + Profiler::start("epsilon_convert_wc_communicate_2"); + // main timing + // cout << Wc_libri; + const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); + Profiler::stop("epsilon_convert_wc_communicate_2"); + Profiler::start("epsilon_convert_wc_communicate_3"); + // parse collected to + for (const auto &MN : atpair_local) + { + const auto &M = MN.first; + const auto &N = MN.second; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + // Use row major for later usage in LibRI + Wc_freq_q[freq][M][N][q] = matrix_m>( + n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); + } + Profiler::stop("epsilon_convert_wc_communicate_3"); + // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) + // for ( 
int i_nu = 0; i_nu != n_nu; i_nu++ ) + // { + // } + } + Profiler::stop("epsilon_convert_wc_communicate"); + Profiler::stop("epsilon_convert_wc_2d_to_ij"); + Profiler::cease("epsilon_wc_work_q_omega"); + LIBRPA::utils::lib_printf_root( + "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, + Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), + Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + Profiler::cease("compute_Wc_freq_q_work"); + LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_work"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); + + return Wc_freq_q; +} + +#endif \ No newline at end of file diff --git a/src/epsilon_cuda.h b/src/epsilon_cuda.h new file mode 100644 index 00000000..0fd7b2d2 --- /dev/null +++ b/src/epsilon_cuda.h @@ -0,0 +1,18 @@ +#pragma once +#include "epsilon.h" +#include "device_connector.h" +#include "matrix_device.h" + +#ifdef LIBRPA_USE_CUDA +CorrEnergy compute_RPA_correlation_cuda(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat); +#endif +#ifdef ENABLE_NVHPC +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat); + +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &, const atpair_k_cplx_mat_t &, + atpair_k_cplx_mat_t &, + const vector> &); +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order='C'); +#endif diff --git a/src/gpu_device_stream.cpp b/src/gpu_device_stream.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/gpu_device_stream.h b/src/gpu_device_stream.h new file mode 100644 index 00000000..849556bd --- /dev/null +++ b/src/gpu_device_stream.h @@ -0,0 +1,121 @@ +#pragma once +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#endif + +class GpuDeviceStream{ +private: + cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? 
CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + + return localRank % deviceCount; + } + + +public: + int rank; + int nranks; + int local_device; + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + + cusolverMpHandle_t cusolver_handle = nullptr; + cublasMpHandle_t cublas_handle = nullptr; + GpuDeviceStream(){ + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + local_device = GpuDeviceStream::getLocalDevice(); + // printf("myrank:%d, local_device:%d\n", rank, local_device); + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = GpuDeviceStream::allgather; + params.req_test = GpuDeviceStream::request_test; + params.req_free = GpuDeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = rank; + params.nranks = nranks; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate(&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolver_handle, local_device, stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublas_handle, stream)); + + } + ~GpuDeviceStream(){ + if(stream!=nullptr){ + CUDA_CHECK(cudaStreamDestroy(stream)); + stream=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + if(cusolver_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolver_handle)); + cusolver_handle=nullptr; + } + if(cublas_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublas_handle)); + cublas_handle=nullptr; + } + } + void cudaSync() const { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,stream)); + } +}; \ No newline at end of file diff --git a/src/helpers.h b/src/helpers.h new file mode 100644 index 00000000..ec014586 --- /dev/null +++ b/src/helpers.h @@ -0,0 +1,107 @@ + +#ifndef HELPERS_H +#define HELPERS_H + +#pragma once +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#endif + + +typedef enum{ + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + LIBRPA_COMPUTE_TYPE_DOUBLE, + LIBRPA_COMPUTE_TYPE_FLOAT +}LIBRPA_DEVICE_COMPUTE_TYPE; + + +#ifdef ENABLE_NVHPC +#define NVHPC_MPI_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != MPI_SUCCESS) \ + { \ + fprintf(stderr, "MPI error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define NVSHMEM_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != 0) \ + { \ + fprintf(stderr, "NVSHMEM error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t status = call; \ + if (status != cudaSuccess) \ + { \ + fprintf(stderr, "CUDA error at %s:%d : %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLASMP_CHECK(call) \ + do \ + { \ + cublasStatus_t status = call; \ + if (status != CUBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuBLASMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } 
while (0) +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define CAL_CHECK(call) \ + do \ + { \ + calError_t status = call; \ + if (status != CAL_OK) \ + { \ + fprintf(stderr, "CAL error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + }while(0) + +#define CUSOLVERMP_CHECK(call) \ + do \ + { \ + cusolverStatus_t status = call; \ + if (status != CUSOLVER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuSOLVERMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif +#define ORDER_CHECK(order) \ + do \ + { \ + if(order !='C'&&order !='c'&&order !='R'&&order !='r') \ + { \ + fprintf(stderr, "Order should be either 'C' or 'R', order error at %s:%d:%s\n", __FILE__, __LINE__, order);\ + exit(EXIT_FAILURE); \ + } \ + }while (0) + +#endif // HELPERS_H \ No newline at end of file diff --git a/src/matrix_device.cpp b/src/matrix_device.cpp new file mode 100644 index 00000000..28b7987e --- /dev/null +++ b/src/matrix_device.cpp @@ -0,0 +1,16 @@ +#include "matrix_device.h" +#include "device_stream.h" + +template +void MatrixDevice::set_data_device(const int& m, const int& n, const void* d_A, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeer(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } +} \ No newline at end of file diff --git a/src/matrix_device.h b/src/matrix_device.h new file mode 100644 index 00000000..000c3847 --- /dev/null +++ b/src/matrix_device.h @@ -0,0 +1,111 @@ +#ifndef MATRIX_DEVICE_H +#define MATRIX_DEVICE_H + +#include +#ifdef ENABLE_NVHPC +#include +#include "helpers.h" +#endif + +template +class MatrixDevice{ +private: + T* d_data=nullptr; + int m=0; + int n=0; +public: + MatrixDevice(){} + MatrixDevice(const int& m, const int& n, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + MatrixDevice(const int& m, const int& n,const void* c_data, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data(const int& m, const int& n, const void* stream){ + if(m!=this->m || n!=this->n){ + clean(stream); + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + } + void set_data(const int& m, const int& n,const 
void* c_data, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data_device(const int& m, const int& n, const void* d_A, const void* stream); + + + void clean(const void* stream){ + if(d_data!=nullptr){ + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + cudaFree(d_data); + #endif + }else{ + #ifdef ENABLE_NVHPC + cudaFreeAsync(d_data, (cudaStream_t)stream); + #endif + } + d_data=nullptr; + } + this->m=0; + this->n=0; + } + T* ptr(){ + return d_data; + } + const T* ptr() const { + return d_data; + } + int nr() const { + return m; + } + int nc() const { + return n; + } + ~MatrixDevice(){ + clean(nullptr); + } + +}; + +#endif // MATRIX_DEVICE_H \ No newline at end of file
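
For orientation, the quantities assembled in the NVHPC branch of compute_Wc_freq_q_blacs_cuda are the standard symmetrized-dielectric expressions (stated here as a summary, not code taken from the patch): with v^{1/2} and v_c^{1/2} the square roots of the bare and truncated Coulomb matrices in the ABF basis produced by power_hemat_blacs,

\[
\varepsilon(\mathbf{q}, i\omega) \;=\; 1 \;-\; v^{1/2}(\mathbf{q})\,\chi_0(\mathbf{q}, i\omega)\,v^{1/2}(\mathbf{q}),
\qquad
W_c(\mathbf{q}, i\omega) \;=\; v_c^{1/2}(\mathbf{q})\,\bigl[\varepsilon^{-1}(\mathbf{q}, i\omega) - 1\bigr]\,v_c^{1/2}(\mathbf{q}).
\]

On the device this is carried out without forming the inverse explicitly: pgetrf_nvhpc_mixed_precision factorizes the dielectric matrix and pgetrs_nvhpc_mixed_precision solves eps * X = v_c^{1/2}, after which the pgeadd/pgemm pair assembles v_c^{1/2} (eps^{-1} - 1) v_c^{1/2}. The transpose calls around the factorization only switch between row- and column-major storage when order == 'c'; they do not change the result.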
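
The rank-to-GPU binding in GpuDeviceStream::getLocalDevice follows a common MPI pattern: split the world communicator by shared-memory node and take the node-local rank modulo the number of visible devices. A minimal standalone sketch of that pattern, using only MPI and the CUDA runtime (no cuSOLVERMp/cuBLASMp); the file name and printed message are illustrative:

    // bind_local_gpu.cpp -- sketch of the node-local rank -> device mapping
    // used by GpuDeviceStream::getLocalDevice(); build with an MPI C++ wrapper
    // and link against the CUDA runtime.
    #include <cstdio>
    #include <mpi.h>
    #include <cuda_runtime.h>

    static int bind_local_gpu()
    {
        // Node-local rank: ranks sharing a node land in the same sub-communicator.
        MPI_Comm local_comm;
        int local_rank = 0;
        MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm);
        MPI_Comm_rank(local_comm, &local_rank);
        MPI_Comm_free(&local_comm);

        int n_dev = 0;
        if (cudaGetDeviceCount(&n_dev) != cudaSuccess || n_dev == 0)
            return -1;  // no usable device; the caller should fall back to the CPU path

        const int dev = local_rank % n_dev;   // round-robin ranks over the node's GPUs
        if (cudaSetDevice(dev) != cudaSuccess)
            return -1;
        cudaFree(nullptr);                    // force early CUDA context creation
        return dev;
    }

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int world_rank = 0;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
        const int dev = bind_local_gpu();
        std::printf("rank %d bound to device %d\n", world_rank, dev);
        MPI_Finalize();
        return 0;
    }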
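
MatrixDevice<T> is a thin RAII wrapper around a device allocation: set_data(m, n, stream) (re)allocates only when the shape changes, set_data(m, n, host_ptr, stream) additionally uploads host data, and clean(stream) releases the buffer (the destructor also frees it). A hypothetical round-trip usage sketch, assuming a build with -DENABLE_NVHPC so that CUDA_CHECK from helpers.h is defined; the placeholder comment stands in for the real cuBLASMp/cuSOLVERMp work:

    // matrix_device_usage.cpp -- hypothetical host <-> device round trip
    // through MatrixDevice from this patch.
    #include <complex>
    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>
    #include <cuComplex.h>
    #include "matrix_device.h"

    int main()
    {
        const int m = 4, n = 4;
        std::vector<std::complex<double>> host(m * n, {1.0, 0.0});

        cudaStream_t stream = nullptr;
        CUDA_CHECK(cudaStreamCreate(&stream));

        // Allocate on the device and upload the host block in one call.
        MatrixDevice<cuDoubleComplex> d_mat;
        d_mat.set_data(m, n, host.data(), stream);

        // ... device work on d_mat.ptr() would go here (pgemm_nvhpc etc. in the real code) ...

        // Copy the block back and wait for the stream before touching host memory.
        CUDA_CHECK(cudaMemcpyAsync(host.data(), d_mat.ptr(),
                                   sizeof(cuDoubleComplex) * d_mat.nr() * d_mat.nc(),
                                   cudaMemcpyDeviceToHost, stream));
        CUDA_CHECK(cudaStreamSynchronize(stream));

        std::printf("block %d x %d round-tripped, first element (%f, %f)\n",
                    d_mat.nr(), d_mat.nc(), host[0].real(), host[0].imag());

        d_mat.clean(stream);   // explicit release; the destructor would also free it
        CUDA_CHECK(cudaStreamDestroy(stream));
        return 0;
    }

Reallocating only on shape changes is what lets the epsilon loop call set_data once per q-point and frequency without churning device memory, since the 2D block sizes stay fixed across iterations.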