From adbe674a8dee9a7c7cc3a5d5c8b5bce1dfc2ad36 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:09:12 +0800 Subject: [PATCH 01/18] epsilon.cpp fix a bug for rpa calculation when open 4 mpi --- epsilon.cpp | 2971 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2971 insertions(+) create mode 100644 epsilon.cpp diff --git a/epsilon.cpp b/epsilon.cpp new file mode 100644 index 00000000..9ed7d46b --- /dev/null +++ b/epsilon.cpp @@ -0,0 +1,2971 @@ +#include "epsilon.h" +#define OPEN_TEST_FOR_LU_DECOMPOSITION +#include +#include + +#include +#include +#include +#include +#include + +#include "atoms.h" +#include "constants.h" +#include "envs_blacs.h" +#include "envs_io.h" +#include "envs_mpi.h" +#include "lapack_connector.h" +#include "libri_utils.h" +#include "matrix_m_parallel_utils.h" +#include "parallel_mpi.h" +#include "params.h" +#include "pbc.h" +#include "profiler.h" +#include "scalapack_connector.h" +#include "stl_io_helper.h" +#include "utils_blacs.h" +#include "utils_io.h" +#include "utils_mem.h" +#include "utils_mpi_io.h" + +#ifdef LIBRPA_USE_LIBRI +#include +#include +using RI::Tensor; +using RI::Communicate_Tensors_Map_Judge::comm_map2_first; +#endif + +using LIBRPA::Array_Desc; +using LIBRPA::envs::blacs_ctxt_global_h; +using LIBRPA::envs::mpi_comm_global_h; +using LIBRPA::envs::ofs_myid; +using LIBRPA::utils::lib_printf; + +CorrEnergy compute_RPA_correlation_blacs_2d_gamma_only(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) + lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D gamma_only\n"); + // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); + const auto &mf = chi0.mf; + const double CONE = 1.0; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + // std::cout << "n_abf " << n_abf << std::endl; + // std::cout << "n_atoms " << LIBRPA::atomic_basis_abf.n_atoms << std::endl; + const auto 
part_range = LIBRPA::atomic_basis_abf.get_part_range(); + // std::cout << "part_range " << part_range[0] << " " << part_range[1] << std::endl; + auto nbs_ = LIBRPA::atomic_basis_abf.get_atom_nbs(); + // std::cout << "nbs_ " << nbs_[0] << " " << nbs_[1] << std::endl; + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); + + vector> qpts; + for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + qpts.push_back(qMuNuchi.first); + + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>> + coul_libri; + + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << + // endl; + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + matrix tmp_vq_real = (*Vq).real(); + std::valarray Vq_va(tmp_vq_real.c, Vq->size); + auto pvq = std::make_shared>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, std::array{0, 0, 0}}] = + Tensor({n_mu, n_nu}, pvq); + coulmat.at(Mu).at(Nu).at(q).reset(); + } + + LIBRPA::utils::release_free_mem(); + + // printf("Finish RPA blacs 2d vq arr\n"); + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + + double block_begin = omp_get_wtime(); + + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + + double block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>> + chi0_libri; + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + + for (const auto &M_Nchi 
: chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second.real(); + std::valarray chi_va(chi.c, chi.size); + auto pchi = std::make_shared>(); + *pchi = chi_va; + chi0_libri[M][{N, std::array{0, 0, 0}}] = + Tensor({n_mu, n_nu}, pchi); + } + } + + // if(mpi_comm_global_h.is_root()) + // { + // lib_printf("Begin to clean chi0 !!! \n"); + // system("free -m"); + // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); + // } + + chi0.free_chi0_q(freq, q); + + LIBRPA::utils::release_free_mem(); + + // if(mpi_comm_global_h.is_root()) + // { + // lib_printf("After clean chi0 !!! \n"); + // system("free -m"); + // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); + // } + + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + // ofs_myid << "chi0_libri" << endl << chi0_libri; + + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + // printf("End collect block myid: %d ifreq: %d TIME_USED: + // %f\n",mpi_comm_global_h.myid,ifreq,chi_end_comm-chi_end_arr); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + } + + double pi_begin = omp_get_wtime(); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc); + // char fnp[100]; + // sprintf(fnp, 
"pi_ifreq_%d_iq_%d.mtx", ifreq, iq); + double pi_end = omp_get_wtime(); + // printf("End pgemm myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); + double trace_pi = 0.0; + double trace_pi_loc = 0.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); + } + + coul_chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; + } + + int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; + int info; + // printf("begin det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); + double ln_det = + compute_pi_det_blacs_2d_gamma_only(coul_chi0_block, desc_nabf_nabf, ipiv, info); + // printf("End det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = complex(trace_pi + ln_det); + /*std::cout << "q: " << iq << ", freq: " << ifreq << ", ln_det:" << ln_det + << ", trace_pi: " << trace_pi << ", rpa_for_omega_q" << rpa_for_omega_q + << ", contribution: " + << rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI << std::endl;*/ + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check + // std::cout << "rpa_for_omega_q: " << rpa_for_omega_q + // << ", freq_weight: " << freq_weight << ", irk_weight[q]:" << + // irk_weight[q] + // << ", cRPA_q[q]: " << cRPA_q[q] << std::endl; + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + } + mpi_comm_global_h.barrier(); + corr.value = tot_RPA_energy; + + corr.etype = CorrEnergy::type::RPA; + return corr; +} + +CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + lib_printf("Begin to compute_RPA_correlation_blacs_2d myid: %d\n", mpi_comm_global_h.myid); + system("free -m"); + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D\n"); + // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + // ofs_myid << "Iset Jset " << s0_s1 << endl; + // ofs_myid << "atpair_unordered_local of myid " << blacs_ctxt_global_h.myid << " " << + // 
atpair_unordered_local << endl; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before vector qpts processid:%d,chi0.tfg.get_freq_nodes()[0]:%f,chi0.get_chi0_q().size():%d\n", mpi_comm_global_h.myid, + // chi0.tfg.get_freq_nodes()[0], chi0.get_chi0_q().size()); + // printf("chi0.get_chi0_q().empty():%d\n", chi0.get_chi0_q().empty()); + printf("processId:%d,chi0.klist.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size()); + // for(const auto &k : chi0.klist) + // { + // printf("processId:%d, k: (%f, %f, %f)\n", mpi_comm_global_h.myid, k.x, k.y, k.z); + // } + #endif + vector> qpts; + + // for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + // { + // qpts.push_back(qMuNuchi.first); + // #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // const auto &q = qMuNuchi.first; + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + // #endif + // } + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif +#ifdef LIBRPA_USE_LIBRI + + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << + // endl; + 
#ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + } + // printf("Finish RPA blacs 2d vq arr\n"); + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + // printf("End vq comm_map2_first myid: %d TIME_USED: + // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); + // ofs_myid << "IJq_coul" << endl << IJq_coul; + // printf("Finish RPA blacs 2d vq 2d\n"); + double block_begin = omp_get_wtime(); + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // // cout << IJq_coul.at(I).at({J, qa}); + // collect_block_from_IJ_storage_syhe( + // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", + // // blacs_ctxt_global_h.myid, I, J, + // // coul_block.nr(), coul_block.nc(), + // // str(coul_block).c_str()); + // } + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double 
block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + + // if(mpi_comm_global_h.is_root()) + // printf("Finish RPA blacs 2d vq comm\n"); + // char fn[100]; + // sprintf(fn, "coul_iq_%d.mtx", iq); + // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); + // ofs_myid << str(coul_block); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>>> + chi0_libri; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + // printf("processId:%d,chi0.get_chi0_q().empty():%d\n", mpi_comm_global_h.myid, chi0.get_chi0_q().empty()); + #endif + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + // const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + #endif + chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); + } + } + if (mpi_comm_global_h.is_root()) + { + lib_printf("Begin to clean chi0 !!! \n"); + LIBRPA::utils::display_free_mem(); + lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", + chi0_wq.size(), freq, q.x, q.y, q.z); + } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + + + LIBRPA::utils::release_free_mem(); + // if(mpi_comm_global_h.is_root()) + // { + // lib_printf("After clean chi0 !!! 
\n"); + // system("free -m"); + // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); + // } + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + // ofs_myid << "chi0_libri" << endl << chi0_libri; + + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + // char fnc[100]; + // sprintf(fnc, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); + // if( ifreq== 0) + // print_matrix_mm_file_parallel(fnc, chi0_block, desc_nabf_nabf); + } + + double pi_begin = omp_get_wtime(); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc); + // char fnp[100]; + // sprintf(fnp, "pi_ifreq_%d_iq_%d.mtx", ifreq, iq); + double pi_end = omp_get_wtime(); + + complex trace_pi(0.0, 0.0); + complex trace_pi_loc(0.0, 0.0); + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); + } + + coul_chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; + // std::cout << "1-Pi: " << ilo << "," << jlo << "," << coul_chi0_block(ilo, jlo) + //<< std::endl; + } + // if( ifreq== 0 && mpi_comm_global_h.is_root() ) + // print_whole_matrix("pi-2D-loc", 
coul_chi0_block); + + int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; + int info; + complex ln_det = + compute_pi_det_blacs_2d(coul_chi0_block, desc_nabf_nabf, ipiv, info); + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + // double task_end = omp_get_wtime(); + // if(mpi_comm_global_h.is_root()) + // lib_printf("| After det for freq: %f, q: ( %f, %f, %f) TIME_LOCMAT: %f + // TIME_DET: %f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n",ifreq, + // q.x,q.y,q.z,task_mid-task_begin,task_end-task_mid,pi_time,loc_tran_time); + // para_mpi.mpi_barrier(); + + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = trace_pi + ln_det; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // < &loc_piT, const Array_Desc &arrdesc_pi, + int *ipiv, int &info) +{ + int one = 1; + int range_all = N_all_mu; + int DESCPI_T[9]; + + double det_begin = omp_get_wtime(); + + ScalapackConnector::pgetrf_f(range_all, range_all, loc_piT.ptr(), one, one, arrdesc_pi.desc, + ipiv, info); + double trf_end = omp_get_wtime(); + + double ln_det_loc = 0.0; + double ln_det_all = 0.0; + + for (int ig = 0; ig != range_all; ig++) + { + int locr = arrdesc_pi.indx_g2l_r(ig); + int locc = arrdesc_pi.indx_g2l_c(ig); + if (locr >= 0 && locc >= 0) + { + double tmp_ln_det; + if (loc_piT(locr, locc) > 0) + { + tmp_ln_det = std::log(loc_piT(locr, locc)); + } + else + { + tmp_ln_det = std::log(-loc_piT(locr, locc)); + } + ln_det_loc += tmp_ln_det; + } + } + double ln_end = omp_get_wtime(); + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); + double det_end = omp_get_wtime(); + return ln_det_all; +} +complex compute_pi_det_blacs_2d(matrix_m> &loc_piT, + const Array_Desc &arrdesc_pi, int *ipiv, int &info) +{ + int one = 1; + int range_all = N_all_mu; + int DESCPI_T[9]; + // if(out_pi) + // { + // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); + // print_complex_real_matrix("first_loc_piT_mat",loc_piT); + // } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before pzgetrf_ processid:%d,range_all: %d, loc_piT.nr(): %d, loc_piT.nc(): %d\n", + mpi_comm_global_h.myid, range_all, loc_piT.nr(), loc_piT.nc()); + #endif + double det_begin = omp_get_wtime(); + // ScalapackConnector::transpose_desc(DESCPI_T, 
arrdesc_pi.desc); + pzgetrf_(&range_all, &range_all, loc_piT.ptr(), &one, &one, arrdesc_pi.desc, ipiv, &info); + double trf_end = omp_get_wtime(); + // ScalapackConnector::pgetrf_f(range_all,range_all,loc_piT.c,one,one,DESCPI_T,ipiv, info); + // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); + // printf("desc myid: %d, m n: %d,%d, mb nb: %d, %d, loc_m_n: %d, %d, myp: %d,%d, npr,npc: + // %d, %d\n",mpi_comm_global_h.myid, arrdesc_pi.m(),arrdesc_pi.n(), + // arrdesc_pi.mb(),arrdesc_pi.nb(), + // arrdesc_pi.m_loc(),arrdesc_pi.n_loc(),arrdesc_pi.myprow(),arrdesc_pi.mypcol(),arrdesc_pi.nprows(),arrdesc_pi.npcols()); + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + // complex det_loc(1.0,0.0); + // complex det_glo(0.0,0.0); + // vector> det_dig; + // vector> ln_det_dig; + // vector> det_dig_r; + // vector> det_dig_c; + // printf(" myid: %d ig=25, locr,locc: %d, + // %d)\n",mpi_comm_global_h.myid,arrdesc_pi.indx_g2l_r(25),arrdesc_pi.indx_g2l_c(25)); + for (int ig = 0; ig != range_all; ig++) + { + // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); + // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); + int locr = arrdesc_pi.indx_g2l_r(ig); + int locc = arrdesc_pi.indx_g2l_c(ig); + if (locr >= 0 && locc >= 0) + { + // if(ipiv[locr]!=(ig+1)) + // det_loc=-1*det_loc * loc_piT(locc,locr); + // else + // det_loc=det_loc * loc_piT(locc,locr); + // det_dig.push_back(loc_piT(locr,locc)); + // det_dig_r.push_back(locr); + // det_dig_c.push_back(locc); + complex tmp_ln_det; + if (loc_piT(locr, locc).real() > 0) + { + tmp_ln_det = std::log(loc_piT(locr, locc)); + // ln_det_dig.push_back(tmp_ln_det); + } + else + { + tmp_ln_det = std::log(-loc_piT(locr, locc)); + // ln_det_dig.push_back(tmp_ln_det); + } + ln_det_loc += tmp_ln_det; + } + } + double ln_end = omp_get_wtime(); + // ComplexMatrix det_mm(loc_piT.nr(),loc_piT.nc()); + // for(int i=0;i!=loc_piT.nr();i++) + // for(int j=0;j!=loc_piT.nc();j++) + // 
det_mm(i,j)=loc_piT(i,j); + // // sort(det_dig.rbegin(),det_dig.rend()); + // ComplexMatrix det_dig_mm(det_dig.size(),4); + // for(int i=0;i!=det_dig.size();i++) + // { + // det_dig_mm(i,0) =det_dig_r[i]; + // det_dig_mm(i,1) =det_dig_c[i]; + // det_dig_mm(i,2)=det_dig[i]; + // det_dig_mm(i,3)=ln_det_dig[i]; + // } + // char fn[100]; + // sprintf(fn, "det_dig_myid_%d.mtx", mpi_comm_global_h.myid); + // print_complex_matrix_file("det_dig_loc", det_dig_mm, fn, false); + + // sprintf(fn, "det_mat_myid_%d.mtx", mpi_comm_global_h.myid); + // print_complex_matrix_file("det_mat_loc", det_mm, fn, false); + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + double det_end = omp_get_wtime(); + // if(mpi_comm_global_h.myid == 0) + // lib_printf(" | Det time trf: %f ln: %f allreduce: + // %f\n",trf_end-det_begin,ln_end-trf_end, det_end-ln_end); + // MPI_Allreduce(&det_loc,&det_glo,1,MPI_DOUBLE_COMPLEX,MPI_PROD,mpi_comm_global_h.comm); + // ln_det_all=std::log(det_glo); + return ln_det_all; +} + +complex compute_pi_det_blacs(ComplexMatrix &loc_piT, const Array_Desc &arrdesc_pi, + int *ipiv, int &info) +{ + // int range_all = atom_mu_part_range[natom-1]+atom_mu[natom-1]; + // int desc_pi[9]; + // int loc_row, loc_col, info; + // int row_nblk=1; + // int col_nblk=1; + int one = 1; + int range_all = N_all_mu; + // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,range_all,range_all,row_nblk,col_nblk); + // int *ipiv = new int [loc_row*10]; + // ComplexMatrix loc_piT(loc_col,loc_row); + + // for(int i=0;i!=loc_row;i++) + // { + // int global_row = para_mpi.globalIndex(i,row_nblk,para_mpi.nprow,para_mpi.myprow); + // int mu; + // int I=atom_mu_glo2loc(global_row,mu); + // for(int j=0;j!=loc_col;j++) + // { + // int global_col = para_mpi.globalIndex(j,col_nblk,para_mpi.npcol,para_mpi.mypcol); + // int nu; + // int J=atom_mu_glo2loc(global_col,nu); + + // if( global_col == global_row) + // { + // loc_piT(j,i)=complex(1.0,0.0) - 
pi_freq_q.at(I).at(J)(mu,nu); + // } + // else + // { + // loc_piT(j,i)=-1* pi_freq_q.at(I).at(J)(mu,nu); + // } + + // } + // } + int DESCPI_T[9]; + // if(out_pi) + // { + // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); + // print_complex_real_matrix("first_loc_piT_mat",loc_piT); + // } + + ScalapackConnector::transpose_desc(DESCPI_T, arrdesc_pi.desc); + + // para_mpi.mpi_barrier(); + // printf(" before LU Myid: %d Available DOS memory = %ld + // bytes\n",mpi_comm_global_h.myid, memavail()); printf(" before LU myid: %d range_all: %d, + // loc_mat.size: %d\n",mpi_comm_global_h.myid,range_all,loc_piT.size); + pzgetrf_(&range_all, &range_all, loc_piT.c, &one, &one, DESCPI_T, ipiv, &info); + // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + for (int ig = 0; ig != range_all; ig++) + { + // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); + // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); + int locr = arrdesc_pi.indx_g2l_r(ig); + int locc = arrdesc_pi.indx_g2l_c(ig); + if (locr >= 0 && locc >= 0) + { + // if(ipiv[locr]!=(ig+1)) + // det_loc=-1*det_loc * loc_piT(locc,locr); + // else + // det_loc=det_loc * loc_piT(locc,locr); + if (loc_piT(locc, locr).real() > 0) + ln_det_loc += std::log(loc_piT(locc, locr)); + else + ln_det_loc += std::log(-loc_piT(locc, locr)); + } + } + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + return ln_det_all; +} + +CorrEnergy compute_RPA_correlation_blacs(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) +{ + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK row\n"); + + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + 
LIBRPA::Array_Desc arrdesc_pi(blacs_ctxt_global_h); + arrdesc_pi.init_square_blk(n_abf, n_abf, 0, 0); + int loc_row = arrdesc_pi.m_loc(), loc_col = arrdesc_pi.n_loc(), info; + + // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,N_all_mu,N_all_mu,row_nblk,col_nblk); + int *ipiv = new int[loc_row * 10]; + // double vq_begin_m2t= omp_get_wtime(); + // std::map>, Tensor>>> + // vq_libri; for(auto &Ip:Vq) + // { + // auto I=Ip.first; + // const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(I); + // for(auto &Jp:Ip.second) + // { + // auto J=Jp.first; + // const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(J); + // for(auto &qp:Jp.second) + // { + // auto q=qp.first; + // std::array qa = {q.x, q.y, q.z}; + // const auto &vq_ptr=qp.second; + // std::valarray> Vq_va(vq_ptr->c, vq_ptr->size); + // auto pvq = std::make_shared>>(); + // *pvq = Vq_va; + // vq_libri[I][{J, qa}] = Tensor>({n_mu, n_nu}, pvq); + // if(I!=J) + // { + // auto vqT=transpose(*vq_ptr, 1); + // std::valarray> VqT_va(vqT.c, vqT.size); + // auto pvqT = std::make_shared>>(); + // *pvqT = VqT_va; + // vq_libri[J][{I, qa}] = Tensor>({n_nu, n_mu}, pvqT); + // } + // } + // } + // } + // double vq_end_m2t = omp_get_wtime(); + // set loc_atp_IJ; + // for(auto &atp:local_atpair) + // { + // loc_atp_IJ.insert(atp.first); + // loc_atp_IJ.insert(atp.second); + // } + // set all_atom_set; + // for(int I=0;I!=natom;I++) + // all_atom_set.insert(I); + // const auto IJq_coul = Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, + // vq_libri, all_atom_set, loc_atp_IJ); atpair_k_cplx_mat_t Vq_loc; double vq_end_comm = + // omp_get_wtime(); for(auto Ip:IJq_coul) + // { + // auto I=Ip.first; + // auto n_mu=atom_mu[I]; + // for(auto &Jqp:Ip.second) + // { + // auto J=Jqp.first.first; + // auto n_nu=atom_mu[J]; + // auto qa=Jqp.first.second; + // Vector3_Order q{qa[0],qa[1],qa[2]}; + // shared_ptr vq_ptr = make_shared(); + // vq_ptr->create(n_mu, n_nu); + // const auto length=sizeof(complex)* n_mu 
*n_nu; + // memcpy((*vq_ptr).c, Jqp.second.ptr(),length); + // Vq_loc[I][J][q]=vq_ptr; + // //printf("| process %d, I: %d J: %d\n",mpi_comm_global_h.myid, I,J ); + // } + // } + // double vq_end_t2m = omp_get_wtime(); + // mpi_comm_global_h.barrier(); + // if(mpi_comm_global_h.is_root()) + // lib_printf("| Vq_time %f, TIME_m2t: %f TIME_comm: %f TIME_t2m: + // %f\n",vq_end_t2m-vq_begin_m2t,vq_end_m2t-vq_begin_m2t,vq_end_comm-vq_end_m2t,vq_end_t2m-vq_end_comm); + map, ComplexMatrix>> pi_freq_q; + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + for (const auto &freq_q_MuNuchi0 : chi0.get_chi0_q()) + { + const auto freq = freq_q_MuNuchi0.first; + const double freq_weight = chi0.tfg.find_freq_weight(freq); + for (const auto &q_MuNuchi0 : freq_q_MuNuchi0.second) + { + double task_begin = omp_get_wtime(); + const auto q = q_MuNuchi0.first; + auto &MuNuchi0 = q_MuNuchi0.second; + + // ComplexMatrix loc_piT(loc_col,loc_row); + auto loc_piT = init_local_mat>(arrdesc_pi, MAJOR::COL); + complex trace_pi(0.0, 0.0); + double vq_time = 0.0; + double pi_time = 0.0; + double loc_tran_time = 0.0; + for (int Mu = 0; Mu != natom; Mu++) + { + double Mu_begin = omp_get_wtime(); + // lib_printf(" |process %d, Mu: %d\n",mpi_comm_global_h.myid,Mu); + const size_t n_mu = atom_mu[Mu]; + atom_mapping::pair_t_old Vq_row = gather_vq_row_q(Mu, coulmat, q); + double Mu_after_vq = omp_get_wtime(); + // atom_mapping::pair_t_old Vq_row; + // const auto IJq_coul = + // Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, vq_libri, + // {Mu}, loc_atp_atoms); double Mu_vq_comm = omp_get_wtime(); for(auto Ip:IJq_coul) + // { + // auto I=Ip.first; + // auto n_mu=atom_mu[I]; + // for(auto &Jqp:Ip.second) + // { + // auto J=Jqp.first.first; + // auto n_nu=atom_mu[J]; + // auto q=Jqp.first.second; + // Vq_row[I][J].create(n_mu,n_nu); + // const auto length=sizeof(complex)* n_mu *n_nu; + // memcpy(Vq_row[I][J].c, Jqp.second.ptr(),length); + // } + // } + // double 
Mu_after_vq=omp_get_wtime(); + // printf(" |process %d, Mu: %d vq_row.size: + // %d\n",para_mpi.get_myid(),Mu,Vq_row[Mu].size()); ComplexMatrix + // loc_pi_row=compute_Pi_freq_q_row(q,MuNuchi0,Vq_loc,Mu,q); + ComplexMatrix loc_pi_row = compute_Pi_freq_q_row(q, MuNuchi0, Vq_row, Mu); + // printf(" |process %d, compute_pi\n",para_mpi.get_myid()); + ComplexMatrix glo_pi_row(n_mu, N_all_mu); + mpi_comm_global_h.barrier(); + mpi_comm_global_h.allreduce_ComplexMatrix(loc_pi_row, glo_pi_row); + double Mu_after_pi_loc = omp_get_wtime(); + // cout<<" glo_pi_rowT nr,nc: "<(1.0, 0.0) - + glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); + } + else + { + loc_piT(i, j) = + -glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); + } + } + } + double Mu_after_loc_tran = omp_get_wtime(); + vq_time += (Mu_after_vq - Mu_begin); + pi_time += (Mu_after_pi_loc - Mu_after_vq); + loc_tran_time += (Mu_after_loc_tran - Mu_after_pi_loc); + } + // if(freq == chi0.tfg.get_freq_nodes()[0] && mpi_comm_global_h.is_root()) + // print_complex_matrix(" loc_piT",loc_piT); + double task_mid = omp_get_wtime(); + // printf("|process %d, before det\n",mpi_comm_global_h.myid); + complex ln_det = compute_pi_det_blacs_2d(loc_piT, arrdesc_pi, ipiv, info); + double task_end = omp_get_wtime(); + if (mpi_comm_global_h.is_root()) + lib_printf( + "| After det for freq: %f, q: ( %f, %f, %f) TIME_Vq_COMM: %f TIME_DET: " + "%f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n", + freq, q.x, q.y, q.z, vq_time, task_end - task_mid, pi_time, loc_tran_time); + // para_mpi.mpi_barrier(); + if (mpi_comm_global_h.myid == 0) + { + complex rpa_for_omega_q = trace_pi + ln_det; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } + + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // <, atom_mapping::pair_t_old>> + pi_freq_q_Mu_Nu; + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); + else + pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); + lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, + pi_freq_q_Mu_Nu.size()); + // mpi_comm_global_h.barrier(); + + int range_all = N_all_mu; + + vector part_range; + part_range.resize(atom_mu.size()); + part_range[0] = 0; + int count_range = 0; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif + for (int I = 0; I != atom_mu.size() - 1; I++) + { + count_range += atom_mu[I]; + part_range[I + 1] = count_range; + } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif + + // cout << "part_range:" << endl; + // for (int I = 0; I != atom_mu.size(); I++) + // { + // cout << part_range[I] << endl; + // } + // cout << "part_range over" << endl; + + // pi_freq_q contains all atoms + map, ComplexMatrix>> pi_freq_q; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("| process %d, qpts.size(): %zu,freq.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size(),chi0.tfg.get_freq_nodes().size()); + #endif + for(const auto &freq : chi0.tfg.get_freq_nodes()) + { + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; 
+ if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi = freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; + pi_freq_q[freq][q].create(range_all, range_all); + + ComplexMatrix pi_munu_tmp(range_all, range_all); + pi_munu_tmp.zero_out(); + if(!chi0.get_chi0_q().empty()) + for (const auto &Mu_Nupi : MuNupi) + { + const auto Mu = Mu_Nupi.first; + const auto Nupi = Mu_Nupi.second; + const size_t n_mu = atom_mu[Mu]; + for (const auto &Nu_pi : Nupi) + { + const auto Nu = Nu_pi.first; + const auto pimat = Nu_pi.second; + const size_t n_nu = atom_mu[Nu]; + + for (size_t mu = 0; mu != n_mu; ++mu) + { + for (size_t nu = 0; nu != n_nu; ++nu) + { + pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); + } + } + } + } + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + { + mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); + } + else + { + pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); + } + } + } + // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + // if (mpi_comm_global_h.myid == 0) + { + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + int num_iteration = 0; + #endif + for (const auto &freq_qpi : pi_freq_q) + { + const auto freq = freq_qpi.first; + const double freq_weight = chi0.tfg.find_freq_weight(freq); + for (const auto &q_pi : freq_qpi.second) + { + const auto q = q_pi.first; + const auto pimat = q_pi.second; + complex rpa_for_omega_q(0.0, 0.0); + ComplexMatrix 
identity(range_all, range_all); + ComplexMatrix identity_minus_pi(range_all, range_all); + identity.set_as_identity_matrix(); + identity_minus_pi = identity - pi_freq_q[freq][q]; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // if(num_iteration==0) + // if(mpi_comm_global_h.myid == 1) + // { + // complex* test_c= identity_minus_pi.c; + // for(int i=0;i det_for_rpa(1.0, 0.0); + int info_LU = 0; + int *ipiv = new int[range_all]; + LapackConnector::zgetrf(range_all, range_all, identity_minus_pi, range_all, ipiv, + &info_LU); + for (int ib = 0; ib != range_all; ib++) + { + if (ipiv[ib] != (ib + 1)) + det_for_rpa = -det_for_rpa * identity_minus_pi(ib, ib); + else + det_for_rpa = det_for_rpa * identity_minus_pi(ib, ib); + } + delete[] ipiv; + + complex trace_pi; + complex ln_det; + ln_det = std::log(det_for_rpa); + trace_pi = trace(pi_freq_q.at(freq).at(q)); + // cout << "PI trace vector:" << endl; + // cout << endl; + rpa_for_omega_q = ln_det + trace_pi; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + // lib_printf("Finish EcRPA %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + map, complex> global_cRPA_q; + for (auto q_weight : irk_weight) + { + MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, + MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); + } + + for (auto &q_crpa : global_cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + complex gather_tot_RPA_energy(0.0, 0.0); + MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, + mpi_comm_global_h.comm); + corr.value = gather_tot_RPA_energy; + } + corr.etype = CorrEnergy::type::RPA; + return corr; +} + +CorrEnergy compute_MP2_correlation(const Chi0 
&chi0, const atpair_k_cplx_mat_t &coulmat) +{ + CorrEnergy corr; + corr.etype = CorrEnergy::type::MP2; + return corr; +} + +map, atom_mapping::pair_t_old>> compute_Pi_q( + const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) +{ + map, atom_mapping::pair_t_old>> pi; + lib_printf("Begin compute_Pi_q , pid: %d\n", mpi_comm_global_h.myid); + for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) + { + const double freq = freq_qJQchi0.first; + for (auto &q_JQchi0 : freq_qJQchi0.second) + { + Vector3_Order q = q_JQchi0.first; + for (auto &JQchi0 : q_JQchi0.second) + { + const size_t J = JQchi0.first; + const size_t J_mu = atom_mu[J]; + for (auto &Qchi0 : JQchi0.second) + { + const size_t Q = Qchi0.first; + const size_t Q_mu = atom_mu[Q]; + // auto &chi0_mat = Qchi0.second; + for (int I = 0; I != natom; I++) + { + // const size_t I = I_p.first; + const size_t I_mu = atom_mu[I]; + pi[freq][q][I][Q].create(I_mu, Q_mu); + if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); + } + } + } + // if(freq==chi0.tfg.get_freq_nodes()[0]) + // for(auto &Ip:pi[freq][q]) + // for(auto &Jp:Ip.second) + // lib_printf(" |process %d, pi atpair: %d, %d + // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); + } + } + + // ofstream fp; + // std::stringstream ss; + // ss<<"out_pi_rank_"< ik_vec = k_pair.first; + auto chi0_freq_k = k_pair.second; + for (auto &J_p : chi0_freq_k) + { + const size_t J = J_p.first; + for (auto &Q_p : J_p.second) + { + const size_t Q = Q_p.first; + auto &chi0_mat = Q_p.second; + for (int I = 0; I != natom; I++) + { + // const size_t I = I_p.first; + // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, + // J, Q); + // cout<<" pi_IQ: "< trace_pi; + // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); + // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << + // ik_vec << " trace_pi: " << trace_pi << endl; + // print_complex_matrix_file(sm.str().c_str(), + // (*Vq.at(I).at(J).at(ik_vec)),fp,false); + // print_complex_matrix_file("chi0:", 
chi0_mat,fp,false); + // print_complex_matrix_file("pi_mat:", + // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); + // } + } + else + { + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", + // mpi_comm_global_h.myid, I, J, Q,2); + // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, 0) << + // endl; + pi.at(freq).at(ik_vec).at(I).at(Q) += + transpose(*Vq.at(J).at(I).at(ik_vec), 1) * chi0_mat; + } + + if (J != Q) + { + ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + if (I <= Q) + { + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d + // \n", mpi_comm_global_h.myid, I, J, Q,3); + // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; + pi.at(freq).at(ik_vec).at(I).at(J) += + (*Vq.at(I).at(Q).at(ik_vec)) * chi0_QJ; + } + else + { + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d + // \n", mpi_comm_global_h.myid, I, J, Q,4); + // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, + // 0) << endl; + pi.at(freq).at(ik_vec).at(I).at(J) += + transpose(*Vq.at(Q).at(I).at(ik_vec), 1) * chi0_QJ; + } + } + } + } + } + } + } + // fp.close(); + // print_complex_matrix(" + // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); + /* print_complex_matrix(" + * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ + return pi; +} + +map, atom_mapping::pair_t_old>> compute_Pi_q_MPI( + const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) +{ + map, atom_mapping::pair_t_old>> pi; + lib_printf("Begin compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); + for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) + { + const double freq = freq_qJQchi0.first; + for (auto &q_JQchi0 : freq_qJQchi0.second) + { + Vector3_Order q = q_JQchi0.first; + for (auto &JQchi0 : q_JQchi0.second) + { + const size_t J = JQchi0.first; + const size_t J_mu = atom_mu[J]; 
+ for (auto &Qchi0 : JQchi0.second) + { + const size_t Q = Qchi0.first; + const size_t Q_mu = atom_mu[Q]; + // auto &chi0_mat = Qchi0.second; + for (int I = 0; I != natom; I++) + { + // const size_t I = I_p.first; + const size_t I_mu = atom_mu[I]; + pi[freq][q][I][Q].create(I_mu, Q_mu); + if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); + } + } + } + // if(freq==chi0.tfg.get_freq_nodes()[0]) + // for(auto &Ip:pi[freq][q]) + // for(auto &Jp:Ip.second) + // lib_printf(" |process %d, pi atpair: %d, %d + // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); + } + } + + // ofstream fp; + // std::stringstream ss; + // ss<<"out_pi_rank_"< ik_vec = k_pair.first; + for (int I = 0; I != natom; I++) + { + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif + atom_mapping::pair_t_old Vq_row = gather_vq_row_q(I, coulmat, ik_vec); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif + for (auto &freq_p : chi0.get_chi0_q()) + { + const double freq = freq_p.first; + const auto chi0_freq = freq_p.second; + + auto chi0_freq_k = freq_p.second.at(ik_vec); + + for (auto &J_p : chi0_freq_k) + { + const size_t J = J_p.first; + for (auto &Q_p : J_p.second) + { + const size_t Q = Q_p.first; + auto &chi0_mat = Q_p.second; + + // const size_t I = I_p.first; + // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, + // J, Q); + // cout<<" pi_IQ: "< trace_pi; + // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); + // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << ik_vec + // << " trace_pi: " << trace_pi << endl; + // print_complex_matrix_file(sm.str().c_str(), + // Vq_row.at(I).at(J),fp,false); print_complex_matrix_file("chi0:", + // chi0_mat,fp,false); print_complex_matrix_file("pi_mat:", + // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); + // } + + if (J != Q) + { + ComplexMatrix 
chi0_QJ = transpose(chi0_mat, 1); + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", + // mpi_comm_global_h.myid, I, J,Q,3); + // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; + pi.at(freq).at(ik_vec).at(I).at(J) += Vq_row.at(I).at(Q) * chi0_QJ; + } + } + } + } + } + } + // fp.close(); + // print_complex_matrix(" + // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); + /* print_complex_matrix(" + * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ + lib_printf("End compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); + return pi; +} + +ComplexMatrix compute_Pi_freq_q_row(const Vector3_Order &ik_vec, + const atom_mapping::pair_t_old &chi0_freq_q, + const atom_mapping::pair_t_old &Vq_row, + const int &I) +{ + map pi; + // lib_printf("Begin cal_pi_k , pid: %d\n", para_mpi.get_myid()); + auto I_mu = atom_mu[I]; + for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); + + omp_lock_t pi_lock; + omp_init_lock(&pi_lock); +#pragma omp parallel for schedule(dynamic) + for (int iap = 0; iap != local_atpair.size(); iap++) + { + const size_t J = local_atpair[iap].first; + const size_t Q = local_atpair[iap].second; + auto &chi0_mat = chi0_freq_q.at(J).at(Q); + auto tmp_pi_mat = Vq_row.at(I).at(J) * chi0_mat; + ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + auto tmp_pi_mat2 = Vq_row.at(I).at(Q) * chi0_QJ; + omp_set_lock(&pi_lock); + pi.at(Q) += tmp_pi_mat; + if (J != Q) + { + pi.at(J) += tmp_pi_mat2; + } + omp_unset_lock(&pi_lock); + } + omp_destroy_lock(&pi_lock); + // for (auto &J_p : chi0_freq_q) + // { + // const size_t J = J_p.first; + // for (auto &Q_p : J_p.second) + // { + // const size_t Q = Q_p.first; + // auto &chi0_mat = Q_p.second; + // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; + // if (J != Q) + // { + // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; + // } + // } + 
// } + // Pi_rowT + // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); + // complex *pi_row_ptr=pi_row.c; + // for(auto &Jp:pi) + // { + // auto J=Jp.first; + // auto J_mu=atom_mu[J]; + // const auto length=sizeof(complex)* I_mu *J_mu; + // memcpy(pi_row_ptr, pi.at(J).c,length); + // pi_row_ptr+=I_mu *J_mu; + // } + ComplexMatrix pi_row(atom_mu[I], N_all_mu); + for (int i = 0; i != pi_row.nr; i++) + for (int J = 0; J != natom; J++) + for (int j = 0; j != atom_mu[J]; j++) + pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); + return pi_row; +} + +ComplexMatrix compute_Pi_freq_q_row_ri(const Vector3_Order &ik_vec, + const atom_mapping::pair_t_old &chi0_freq_q, + const atpair_k_cplx_mat_t &Vq_loc, const int &I, + const Vector3_Order &q) +{ + map pi; + // lib_printf("Begin cal_pi_k , pid: %d\n", mpi_comm_global_h.myid); + auto I_mu = atom_mu[I]; + for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); + + omp_lock_t pi_lock; + omp_init_lock(&pi_lock); +#pragma omp parallel for schedule(dynamic) + for (int iap = 0; iap != local_atpair.size(); iap++) + { + const size_t J = local_atpair[iap].first; + const size_t Q = local_atpair[iap].second; + auto &chi0_mat = chi0_freq_q.at(J).at(Q); + // printf("| IN cal Pi process %d, I: %d J: %d Q: %d\n",mpi_comm_global_h.myid, I,J,Q ); + auto tmp_pi_mat = *Vq_loc.at(I).at(J).at(q) * chi0_mat; + ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + auto tmp_pi_mat2 = *Vq_loc.at(I).at(Q).at(q) * chi0_QJ; + omp_set_lock(&pi_lock); + pi.at(Q) += tmp_pi_mat; + if (J != Q) + { + pi.at(J) += tmp_pi_mat2; + } + omp_unset_lock(&pi_lock); + } + omp_destroy_lock(&pi_lock); + // for (auto &J_p : chi0_freq_q) + // { + // const size_t J = J_p.first; + // for (auto &Q_p : J_p.second) + // { + // const size_t Q = Q_p.first; + // auto &chi0_mat = Q_p.second; + // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; + // if (J != Q) + // { + // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; + // } + // } + 
// } + // Pi_rowT + // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); + // complex *pi_row_ptr=pi_row.c; + // for(auto &Jp:pi) + // { + // auto J=Jp.first; + // auto J_mu=atom_mu[J]; + // const auto length=sizeof(complex)* I_mu *J_mu; + // memcpy(pi_row_ptr, pi.at(J).c,length); + // pi_row_ptr+=I_mu *J_mu; + // } + ComplexMatrix pi_row(atom_mu[I], N_all_mu); + for (int i = 0; i != pi_row.nr; i++) + for (int J = 0; J != natom; J++) + for (int j = 0; j != atom_mu[J]; j++) + pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); + return pi_row; +} + +atom_mapping::pair_t_old gather_vq_row_q(const int &I, + const atpair_k_cplx_mat_t &coulmat, + const Vector3_Order &ik_vec) +{ + auto I_mu = atom_mu[I]; + atom_mapping::pair_t_old Vq_row; + for (int J_tmp = 0; J_tmp != natom; J_tmp++) + { + auto J_mu = atom_mu[J_tmp]; + ComplexMatrix loc_vq(atom_mu[I], atom_mu[J_tmp]); + Vq_row[I][J_tmp].create(atom_mu[I], atom_mu[J_tmp]); + // const auto length=sizeof(complex)* I_mu *J_mu; + // complex *loc_vq_ptr=loc_vq.c; + if (I <= J_tmp) + { + if (Vq.count(I)) + if (Vq.at(I).count(J_tmp)) loc_vq = *Vq.at(I).at(J_tmp).at(ik_vec); + } + else + { + if (Vq.count(J_tmp)) + if (Vq.at(J_tmp).count(I)) loc_vq = transpose(*Vq.at(J_tmp).at(I).at(ik_vec), 1); + } + mpi_comm_global_h.allreduce_ComplexMatrix(loc_vq, Vq_row[I][J_tmp]); + } + return Vq_row; +} + +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + const int range_all = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using LAPACK" << endl; + } + + mpi_comm_global_h.barrier(); + // use q-points as the outmost loop, so that square root of Coulomb will not be recalculated at + // each frequency point + vector> qpts; + for (const auto &qMuNuchi : 
chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + qpts.push_back(qMuNuchi.first); + + for (const auto &q : qpts) + { + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + char fn[80]; + + ComplexMatrix Vq_all(range_all, range_all); + for (const auto &Mu_NuqVq : coulmat_eps) + { + auto Mu = Mu_NuqVq.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_qVq : Mu_NuqVq.second) + { + auto Nu = Nu_qVq.first; + if (0 == Nu_qVq.second.count(q)) continue; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + { + Vq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = + (*Nu_qVq.second.at(q))(i_mu, i_nu); + Vq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = + conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); + } + } + } + if (Params::debug) + { + sprintf(fn, "Vq_all_q_%d.mtx", iq); + print_complex_matrix_mm(Vq_all, Params::output_dir + "/" + fn, 1e-15); + } + auto sqrtVq_all = power_hemat(Vq_all, 0.5, true, false, Params::sqrt_coulomb_threshold); + // Vq_all is now eigenvectors of the original Coulomb matrix + const auto &Vq_eigen = Vq_all; + if (Params::debug) + { + sprintf(fn, "sqrtVq_all_q_%d.mtx", iq); + print_complex_matrix_mm(sqrtVq_all, Params::output_dir + "/" + fn, 1e-15); + // sprintf(fn, "rotated_sqrtVq_all_q_%d.mtx", iq); + // print_complex_matrix_mm(Vq_all * sqrtVq_all * transpose(Vq_all, true), fn, 1e-15); + // print_complex_matrix_mm(transpose(Vq_all, true) * sqrtVq_all * Vq_all, fn, 1e-15); + sprintf(fn, "Vqeigenvec_q_%d.mtx", iq); + print_complex_matrix_mm(Vq_eigen, Params::output_dir + "/" + fn, 1e-15); + } + + // truncated (cutoff) Coulomb + ComplexMatrix Vqcut_all(range_all, range_all); + for (auto &Mu_NuqVq : coulmat_wc) + { + auto Mu = Mu_NuqVq.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_qVq : Mu_NuqVq.second) + { + auto Nu = Nu_qVq.first; + if (0 == Nu_qVq.second.count(q)) continue; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for 
(int i_nu = 0; i_nu != n_nu; i_nu++) + { + Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = + (*Nu_qVq.second.at(q))(i_mu, i_nu); + Vqcut_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = + conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); + } + } + } + auto sqrtVqcut_all = + power_hemat(Vqcut_all, 0.5, false, true, Params::sqrt_coulomb_threshold); + // sprintf(fn, "sqrtVqcut_all_q_%d.mtx", iq); + // print_complex_matrix_mm(sqrtVqcut_all, fn, 1e-15); + sprintf(fn, "Vqcut_all_filtered_q_%d.mtx", iq); + // print_complex_matrix_mm(Vqcut_all, fn, 1e-15); + // save the filtered truncated Coulomb back to the atom mapping object + // TODO: revise the necessity + for (auto &Mu_NuqVq : coulmat_wc) + { + auto Mu = Mu_NuqVq.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_qVq : Mu_NuqVq.second) + { + auto Nu = Nu_qVq.first; + if (0 == Nu_qVq.second.count(q)) continue; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + (*Nu_qVq.second.at(q))(i_mu, i_nu) = + Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu); + } + } + + ComplexMatrix chi0fq_all(range_all, range_all); + for (const auto &freq_qMuNuchi : chi0.get_chi0_q()) + { + auto freq = freq_qMuNuchi.first; + auto ifreq = chi0.tfg.get_freq_index(freq); + auto MuNuchi = freq_qMuNuchi.second.at(q); + for (const auto &Mu_Nuchi : MuNuchi) + { + auto Mu = Mu_Nuchi.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_chi : Mu_Nuchi.second) + { + auto Nu = Nu_chi.first; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + { + chi0fq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = + Nu_chi.second(i_mu, i_nu); + chi0fq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = + conj(Nu_chi.second(i_mu, i_nu)); + } + } + } + sprintf(fn, "chi0fq_all_q_%d_freq_%d.mtx", iq, ifreq); + print_complex_matrix_mm(chi0fq_all, Params::output_dir + "/" + fn, 1e-15); + + ComplexMatrix identity(range_all, range_all); + 
identity.set_as_identity_matrix(); + auto eps_fq = sqrtVq_all * chi0fq_all * sqrtVq_all; + eps_fq = transpose(Vq_eigen, true) * eps_fq * Vq_eigen; + if (!epsmac_LF_imagfreq.empty() && is_gamma_point(q)) + { + // rotate to Coulomb-diagonal basis + // lib_printf("Largest off-diagonal = %f\n", eps_fq.get_max_abs_offdiag()); + // print_matrix("rotated eps_fq: ", eps_fq.real()); + // replacing the element corresponding to largest Coulomb eigenvalue with dielectric + // function + lib_printf("%22.12f %22.12f %22.12f %22.12f\n", freq, eps_fq(0, 0).real(), + eps_fq(eps_fq.nr - 1, eps_fq.nc - 1).real(), + epsmac_LF_imagfreq[ifreq].real()); + // eps_fq(eps_fq.nr - 1, eps_fq.nc - 1) = epsmac_LF_imagfreq[ifreq]; + eps_fq(0, 0) = 1.0 - epsmac_LF_imagfreq[ifreq]; + } + if (Params::debug) + { + sprintf(fn, "rotated_vsxvs_q_%d_freq_%d.mtx", iq, ifreq); + print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); + } + // rotate back to ABF + eps_fq = Vq_eigen * eps_fq * transpose(Vq_eigen, true); + eps_fq = identity - eps_fq; + if (Params::debug) + { + sprintf(fn, "eps_q_%d_freq_%d.mtx", iq, ifreq); + print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); + } + + // invert the epsilon matrix + power_hemat_onsite(eps_fq, -1); + auto wc_all = sqrtVqcut_all * (eps_fq - identity) * sqrtVqcut_all; + // sprintf(fn, "inveps_q_%d_freq_%d.mtx", iq, ifreq); + // print_complex_matrix_mm(eps_fq, fn, 1e-15); + // sprintf(fn, "wc_q_%d_freq_%d.mtx", iq, ifreq); + // print_complex_matrix_mm(wc_all, fn, 1e-15); + + // save result to the atom mapping object + for (auto &Mu_Nuchi : MuNuchi) + { + auto Mu = Mu_Nuchi.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_chi : Mu_Nuchi.second) + { + auto Nu = Nu_chi.first; + auto n_nu = atom_mu[Nu]; + shared_ptr wc_ptr = make_shared(); + wc_ptr->create(n_mu, n_nu); + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + { + (*wc_ptr)(i_mu, i_nu) = + wc_all(part_range[Mu] + i_mu, part_range[Nu] + 
i_nu); + } + Wc_freq_q[freq][Mu][Nu][q] = + matrix_m>(n_mu, n_nu, wc_ptr->c, MAJOR::ROW, MAJOR::ROW); + } + } + } + } + + return Wc_freq_q; +} + +// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using ScaLAPACK" << endl; + } + mpi_comm_global_h.barrier(); + + Profiler::start("compute_Wc_freq_q_blacs_init"); + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // Use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter Maximal blocksize ensure that atom indices related to the rows/columns of a + // local matrix is minimized. + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + // This, however, is not optimal for matrix operations, and may lead to segment fault during + // MPI operations with parallel linear algebra subroutine. 
Thus we define an optimal blocksize + Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); + const int nb_opt = min(128, desc_nabf_nabf.nb()); + desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); + // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric + // matrix + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + // temp_block is used to collect data from IJ-pair data structure with comm_map2_first + auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + // Below are the working arrays for matrix operations + auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + + const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + + coul_chi0_block.size() + coulwc_block.size()) * + 16.0e-6; + ofs_myid << get_timestamp() + << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks + << endl; + + const auto atpair_local = dispatch_upper_trangular_tasks( + natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, + blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); +#ifdef LIBRPA_DEBUG + ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; + ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; +#endif + + // IJ pair of Wc to be returned + pair, set> Iset_Jset_Wc; + for (const auto &ap : atpair_local) + { + Iset_Jset_Wc.first.insert(ap.first); + Iset_Jset_Wc.second.insert(ap.second); + } + + // Prepare local basis indices for 2D->IJ map + int I, 
iI; + map> map_lor_v; + map> map_loc_v; + for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_lor_v[I].push_back(iI); + } + for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_loc_v[I].push_back(iI); + } + + vector> qpts; + for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); + + vec eigenvalues(n_abf); + Profiler::cease("compute_Wc_freq_q_blacs_init"); + LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); + + Profiler::start("compute_Wc_freq_q_work"); +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); + const int iq_in_k = + std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); + // q-point in fractional coordinates + const auto &qf = kfrac_list[iq_in_k]; + LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, + qpts.size(), qf.x, qf.y, qf.z); + coul_block.zero_out(); + coulwc_block.zero_out(); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + // q-array for LibRI object + std::array qa = {q.x, q.y, q.z}; + + // collect the block elements of truncated coulomb matrices first + // as we reuse coul_eigen_block to reduce memory usage + Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); + { + size_t n_singular_coulwc; + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = 
Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || + coulmat_wc.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // collect_block_from_IJ_storage_syhe( + // coulwc_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // } + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); + Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); + power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt"); + 
LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); + ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; + + Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); + // collect the block elements of coulomb matrices + { + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || + coulmat_eps.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; + // ofs_myid << "Couleps_libri" << endl << couleps_libri; + // if (couleps_libri.size() == 0) + // throw std::logic_error("data at q-point not found in coulmat_eps"); + + // perform communication + ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; +#ifdef LIBRPA_DEBUG + ofs_myid << set_IJ_nabf_nabf << endl; + ofs_myid << "Extended blocks" << endl; + ofs_myid << "atom 1: " << s0_s1.first << endl; + ofs_myid << "atom 2: " << s0_s1.second << endl; +#endif + // ofs_myid << "Owned blocks\n"; + // print_keys(ofs_myid, couleps_libri); + // mpi_comm_global_h.barrier(); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + 
mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; + + ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + ofs_myid << get_timestamp() << " Done construct couleps 2D block" << endl; + } + // char fn[100]; + // sprintf(fn, "couleps_iq_%d.mtx", iq); + // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); + // ofs_myid << str(coul_block); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + size_t n_singular; + ofs_myid << get_timestamp() << " Start power hemat couleps\n"; + matrix_m> sqrtveig_blacs; + if (is_gamma_point(q)) + { + // choice of power_hemat_blacs_real/power_hemat_blacs_desc + // leads to sub-meV difference + sqrtveig_blacs = power_hemat_blacs_real( + coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, + eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); + if (Params::replace_w_head && Params::option_dielect_func == 3) + { + df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); + } + } + else + { + sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + } + ofs_myid << get_timestamp() << " Done power hemat couleps\n"; + // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); + // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at + // imaginary frequency is not prepared + if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); + const size_t n_nonsingular = n_abf - n_singular; + 
Profiler::stop("epsilon_prepare_couleps_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); + ofs_myid << get_timestamp() << " Done couleps sqrt\n"; + std::flush(ofs_myid); + + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + Profiler::start("epsilon_wc_work_q_omega"); + Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); + chi0_block.zero_out(); + { + std::map>, + RI::Tensor>>> + chi0_libri; + if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) + { + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = + RI::Tensor>({n_mu, n_nu}, pchi); + } + } + // Release the chi0 block for this frequency and q to reduce memory load, + // as they will not be used again + chi0.free_chi0_q(freq, q); + } + // ofs_myid << "chi0_libri" << endl << chi0_libri; + Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); + const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // collect_block_from_IJ_storage_syhe( + // chi0_block, desc_nabf_nabf, 
LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_chi0.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // } + Profiler::start("epsilon_prepare_chi0_2d_collect_block"); + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); + // sprintf(fn, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); + // print_matrix_mm_file_parallel(fn, chi0_block, desc_nabf_nabf); + } + Profiler::stop("epsilon_prepare_chi0_2d"); + + Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); + + // for Gamma point, overwrite the head term + if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) + { + ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; + // rotate to Coulomb-eigenvector basis + // descending order + ScalapackConnector::pgemm_f( + 'N', 'N', n_abf, n_nonsingular, n_abf, 1.0, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + 0.0, coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemm_f('C', 'N', n_nonsingular, n_nonsingular, n_abf, 1.0, + sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + + if (Params::option_dielect_func == 3) + { + chi0_block *= -1.0; + for (int i = 0; i != n_nonsingular; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) += 1.0; + } + ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" + << endl; + df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); + 
} + else + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); + if (ilo >= 0 && jlo >= 0) + { + ofs_myid << get_timestamp() << "Perform the head element overwrite" << endl; + chi0_block(ilo, jlo) = 1.0 - epsmac_LF_imagfreq[ifreq]; + } + } + // rotate back to ABF + // descending order + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_nonsingular, n_nonsingular, 1.0, + coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemm_f('N', 'C', n_abf, n_abf, n_nonsingular, 1.0, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + if (Params::option_dielect_func != 3) + { + // now chi0_block is actually v1/2 chi v1/2 + chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) += 1.0; + } + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); + invert_scalapack(chi0_block, desc_nabf_nabf_opt); + } + // subtract 1 from diagonal + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) -= 1.0; + } + } + else + { + Profiler::start("epsilon_compute_eps_pgemm_1"); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, + 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, + 1, desc_nabf_nabf_opt.desc); + Profiler::cease("epsilon_compute_eps_pgemm_1"); + 
Profiler::start("epsilon_compute_eps_pgemm_2"); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, + chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + Profiler::cease("epsilon_compute_eps_pgemm_2"); + // now chi0_block is actually v1/2 chi v1/2 + chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) += 1.0; + } + Profiler::stop("epsilon_compute_eps"); + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); + invert_scalapack(chi0_block, desc_nabf_nabf_opt); + // subtract 1 from diagonal + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) -= 1.0; + } + Profiler::stop("epsilon_invert_eps"); + } + // debug for GaAs + // for (int i = 0; i != n_abf; i++) + // { + // for (int j = 0; j != n_abf; j++) + // { + // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + // if (ilo < 0) continue; + // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + // if (jlo < 0) continue; + // if(i==j) + // chi0_block(ilo, jlo) = 1.0; + // else + // chi0_block(ilo, jlo) = 0.0; + // } + // } + // debug for unfold shrink Wc + // for (int i = 0; i != n_abf; i++) + //{ + // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + // if (ilo < 0) continue; + // for (int j = 0; j != n_abf; j++) + // { + // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + // if (jlo < 0) continue; + // if (i == j) + // chi0_block(ilo, jlo) = 1.0; + // else + // chi0_block(ilo, jlo) = 0.0; + // } + // } + // debug end + + 
Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coulwc_block.ptr(), 1, + 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_chi0_block.ptr(), + 1, 1, desc_nabf_nabf_opt.desc, coulwc_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, 0.0, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_multiply_coulwc"); + // lib_printf("chi0_block\n%s", str(chi0_block).c_str()); + // now chi0_block is the screened Coulomb interaction Wc (i.e. W-V) + + Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); + Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); + map>>> Wc_MNmap; + // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, + // LIBRPA::atomic_basis_abf, chi0_block, + // desc_nabf_nabf, MAJOR::ROW); + map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, + temp_block, desc_nabf_nabf, MAJOR::ROW); + Profiler::stop("epsilon_convert_wc_map_block"); + + Profiler::start("epsilon_convert_wc_communicate", "Communicate"); + { + std::map>, + RI::Tensor>>> + Wc_libri; + Profiler::start("epsilon_convert_wc_communicate_1"); + for (const auto &M_NWc : Wc_MNmap) + { + const auto &M = M_NWc.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_Wc : M_NWc.second) + { + const auto &N = N_Wc.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &Wc = N_Wc.second; + // std::valarray> Wc_va(Wc.ptr(), Wc.size()); + // auto pWc = std::make_shared>>(); + // *pWc = Wc_va; + /*if (iq == 10 && ifreq == 10) + { + 
char fn[100]; + sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); + print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); + }*/ + Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); + } + } + Profiler::stop("epsilon_convert_wc_communicate_1"); + Profiler::start("epsilon_convert_wc_communicate_2"); + // main timing + // cout << Wc_libri; + const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); + Profiler::stop("epsilon_convert_wc_communicate_2"); + Profiler::start("epsilon_convert_wc_communicate_3"); + // parse collected to + for (const auto &MN : atpair_local) + { + const auto &M = MN.first; + const auto &N = MN.second; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + // Use row major for later usage in LibRI + Wc_freq_q[freq][M][N][q] = matrix_m>( + n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); + } + Profiler::stop("epsilon_convert_wc_communicate_3"); + // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) + // for ( int i_nu = 0; i_nu != n_nu; i_nu++ ) + // { + // } + } + Profiler::stop("epsilon_convert_wc_communicate"); + Profiler::stop("epsilon_convert_wc_2d_to_ij"); + Profiler::cease("epsilon_wc_work_q_omega"); + LIBRPA::utils::lib_printf_root( + "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, + Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), + Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + Profiler::cease("compute_Wc_freq_q_work"); + LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_work"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); + + return Wc_freq_q; +} + +map, matrix_m>>>::pair_t_old> +FT_Wc_freq_q( + const map, matrix_m>>>::pair_t_old> + 
&Wc_freq_q, + const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) +{ + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + map, matrix_m>>>::pair_t_old> + Wc_freq_R; + const int ngrids = tfg.get_n_grids(); + if (Params::debug) + { + if (mpi_comm_global_h.is_root()) lib_printf("Converting Wc q,w -> R,t\n"); + mpi_comm_global_h.barrier(); + } + set> atpairs_unique; + for (const auto &freq_MuNuqWc : Wc_freq_q) + { + for (const auto &Mu_NuqWc : freq_MuNuqWc.second) + { + const auto Mu = Mu_NuqWc.first; + for (const auto &Nu_qWc : Mu_NuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + } + + vector>, pair>> ifreqR_atpair_all; + // allocate space before hand + for (auto R : Rlist) + { + for (int ifreq = 0; ifreq != ngrids; ifreq++) + { + auto freq = tfg.get_freq_nodes()[ifreq]; + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu[Nu]; + Wc_freq_R[freq][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); + ifreqR_atpair_all.push_back({{ifreq, R}, atpair_unique}); + } + } + } + +#pragma omp parallel for schedule(dynamic) + for (auto ifreqR_atpair : ifreqR_atpair_all) + { + const auto ifreq = ifreqR_atpair.first.first; + const auto freq = tfg.get_freq_nodes()[ifreq]; + const auto R = ifreqR_atpair.first.second; + const auto Mu = ifreqR_atpair.second.first; + const auto Nu = ifreqR_atpair.second.second; + const int n_mu = atom_mu[Mu]; + const int n_nu = atom_mu[Nu]; + + // thread local temporary matrix + matrix_m> WfR_temp(n_mu, n_nu, major_Wc); + + if (Wc_freq_q.count(freq) == 0) continue; + if (Wc_freq_q.at(freq).count(Mu) == 0) continue; + if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; + + for (auto &Wc_q : Wc_freq_q.at(freq).at(Mu).at(Nu)) + { + const auto 
q = Wc_q.first; + const auto &Wc = Wc_q.second; + for (auto q_bz : map_irk_ks[q]) + { + const double ang = -q_bz * (R * latvec) * TWO_PI; + const complex weight = + complex(cos(ang), sin(ang)) / double(n_k_points); + if (q == q_bz) + WfR_temp += Wc * weight; + else + WfR_temp += conj(Wc) * weight; + } + } + // omp_set_lock(&lock_Wc); + Wc_freq_R[freq][Mu][Nu][R] += WfR_temp; + // omp_unset_lock(&lock_Wc); + } + + if (mpi_comm_global_h.is_root()) + { + lib_printf("Done converting Wc(q,w) -> Wc(R,w)\n"); + } + mpi_comm_global_h.barrier(); + + return Wc_freq_R; +} + +map, matrix_m>>>::pair_t_old> +CT_FT_Wc_freq_q( + const map, matrix_m>>>::pair_t_old> + &Wc_freq_q, + const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) +{ + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + map, matrix_m>>>::pair_t_old> + Wc_tau_R; + if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); + const int ngrids = tfg.get_n_grids(); + + LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(R,t)\n"); + mpi_comm_global_h.barrier(); + + set> atpairs_unique; + for (const auto &freq_MuNuqWc : Wc_freq_q) + { + for (const auto &Mu_NuqWc : freq_MuNuqWc.second) + { + const auto Mu = Mu_NuqWc.first; + for (const auto &Nu_qWc : Mu_NuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + } + + vector>, pair>> itauR_atpair_all; + // allocate space before hand + for (auto R : Rlist) + { + for (int itau = 0; itau != ngrids; itau++) + { + auto tau = tfg.get_time_nodes()[itau]; + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu[Nu]; + Wc_tau_R[tau][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); + itauR_atpair_all.push_back({{itau, R}, atpair_unique}); + } + } + } 
+ + LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", + LIBRPA::envs::myid_global, itauR_atpair_all.size(), + omp_get_max_threads()); + +#pragma omp parallel for schedule(dynamic) + for (auto itauR_atpair : itauR_atpair_all) + { + const auto itau = itauR_atpair.first.first; + const auto tau = tfg.get_time_nodes()[itau]; + const auto R = itauR_atpair.first.second; + const auto Mu = itauR_atpair.second.first; + const auto Nu = itauR_atpair.second.second; + const int n_mu = atom_mu[Mu]; + const int n_nu = atom_mu[Nu]; + + // thread local temporary matrix + matrix_m> WtR_temp(n_mu, n_nu, major_Wc); + + for (int ifreq = 0; ifreq < ngrids; ifreq++) + { + const auto freq = tfg.get_freq_nodes()[ifreq]; + const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); + // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << + // "\n"; + if (Wc_freq_q.count(freq) == 0) continue; + if (Wc_freq_q.at(freq).count(Mu) == 0) continue; + if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; + // cout << "freq: " << freq << "\n"; + + const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); + for (auto &Wc_q : Wc_q_all) + { + const auto q = Wc_q.first; + const auto &Wc = Wc_q.second; + for (auto q_bz : map_irk_ks[q]) + { + const double ang = -q_bz * (R * latvec) * TWO_PI; + const complex weight = + complex(cos(ang), sin(ang)) * f2t / double(n_k_points); + // ofs_myid << q << " " << q_bz << " weight = " << weight << "\n"; + // ofs_myid << q_Wc.second; + if (q == q_bz) + WtR_temp += Wc * weight; + else + WtR_temp += conj(Wc) * weight; + } + } + } + // omp_set_lock(&lock_Wc); + Wc_tau_R[tau][Mu][Nu][R] += WtR_temp; + // omp_unset_lock(&lock_Wc); + } + + LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> R,t\n"); + mpi_comm_global_h.barrier(); + + // myz debug: check the imaginary part of the matrix + // NOTE: if G(R) is real, is W(R) real as well? 
+ // if (Params::debug) + // { + // for (const auto & tau_MuNuRWc: Wc_tau_R) + // { + // char fn[80]; + // auto tau = tau_MuNuRWc.first; + // auto itau = tfg.get_time_index(tau); + // for (const auto & Mu_NuRWc: tau_MuNuRWc.second) + // { + // auto Mu = Mu_NuRWc.first; + // // const int n_mu = atom_mu[Mu]; + // for (const auto & Nu_RWc: Mu_NuRWc.second) + // { + // auto Nu = Nu_RWc.first; + // // const int n_nu = atom_mu[Nu]; + // for (const auto & R_Wc: Nu_RWc.second) + // { + // auto R = R_Wc.first; + // auto Wc = R_Wc.second; + // auto iteR = std::find(Rlist.cbegin(), Rlist.cend(), R); + // auto iR = std::distance(Rlist.cbegin(), iteR); + // sprintf(fn, "Wc_Mu_%zu_Nu_%zu_iR_%zu_itau_%d_id_%d.mtx", Mu, Nu, iR, + // itau, mpi_comm_global_h.myid); print_matrix_mm_file(Wc, + // Params::output_dir + "/" + fn, 1e-10); + // } + // } + // } + // } + // } + // end myz debug + return Wc_tau_R; +} + +map, matrix_m>>>::pair_t_old> +CT_FT_Wc_freq2time_q( + const map, matrix_m>>>::pair_t_old> + &Wc_freq_q, + const TFGrids &tfg, const int &n_k_points, const vector> &Rlist, + const vector> &qlist) +{ + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + map, matrix_m>>>::pair_t_old> + Wc_tau_q; + if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); + const int ngrids = tfg.get_n_grids(); + + LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(q,t)\n"); + mpi_comm_global_h.barrier(); + + set> atpairs_unique; + for (const auto &freq_MuNuqWc : Wc_freq_q) + { + for (const auto &Mu_NuqWc : freq_MuNuqWc.second) + { + const auto Mu = Mu_NuqWc.first; + for (const auto &Nu_qWc : Mu_NuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + } + vector>> itau_atpair_all; + // allocate space before hand + + for (int itau = 0; itau != ngrids; itau++) + { + auto tau = 
tfg.get_time_nodes()[itau]; + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu_s[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu_s[Nu]; + for (auto q : qlist) + Wc_tau_q[tau][Mu][Nu][q] = matrix_m>(n_mu, n_nu, major_Wc); + itau_atpair_all.push_back({itau, atpair_unique}); + } + } + + LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", + LIBRPA::envs::myid_global, itau_atpair_all.size(), + omp_get_max_threads()); + +#pragma omp parallel for schedule(dynamic) + for (auto itau_atpair : itau_atpair_all) + { + const auto itau = itau_atpair.first; + const auto tau = tfg.get_time_nodes()[itau]; + const auto Mu = itau_atpair.second.first; + const auto Nu = itau_atpair.second.second; + const int n_mu = atom_mu_s[Mu]; + const int n_nu = atom_mu_s[Nu]; + + // thread local temporary matrix + matrix_m> Wtq_temp(n_mu, n_nu, major_Wc); + + for (int ifreq = 0; ifreq < ngrids; ifreq++) + { + const auto freq = tfg.get_freq_nodes()[ifreq]; + const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); + // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << + // "\n"; + if (Wc_freq_q.count(freq) == 0) continue; + if (Wc_freq_q.at(freq).count(Mu) == 0) continue; + if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; + // cout << "freq: " << freq << "\n"; + + const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); + for (auto &Wc_q : Wc_q_all) + { + const auto q = Wc_q.first; + const auto &Wc = Wc_q.second; + const double weight = f2t; + Wtq_temp = Wc * weight; + // omp_set_lock(&lock_Wc); + Wc_tau_q[tau][Mu][Nu][q] += Wtq_temp; + // omp_unset_lock(&lock_Wc); + } + } + } + + LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> q,t\n"); + mpi_comm_global_h.barrier(); + + return Wc_tau_q; +} + +atom_mapping, matrix_m>>>::pair_t_old CT_FT_Wc_tau_R2q( + const atom_mapping, matrix_m>>>::pair_t_old + &Wc_tau_q, + const TFGrids &tfg, 
const int &n_kpoints, const vector> &Rlist, + const int &itau) +{ + const int tau = tfg.get_time_nodes()[itau]; + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + atom_mapping, matrix_m>>>::pair_t_old Wc_tau_R; + if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); + const int ngrids = tfg.get_n_grids(); + + LIBRPA::utils::lib_printf_root("Converting Wc(q,t) -> W(R,t)\n"); + mpi_comm_global_h.barrier(); + + set> atpairs_unique; + for (const auto &MuNuqWc : Wc_tau_q) + { + const auto Mu = MuNuqWc.first; + for (const auto &Nu_qWc : MuNuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + + vector, pair>> iR_atpair_all; + // allocate space before hand + for (auto R : Rlist) + { + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu_l[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu_l[Nu]; + Wc_tau_R[Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); + iR_atpair_all.push_back({R, atpair_unique}); + } + } + + LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", + LIBRPA::envs::myid_global, iR_atpair_all.size(), + omp_get_max_threads()); + +#pragma omp parallel for schedule(dynamic) + for (auto iR_atpair : iR_atpair_all) + { + const auto R = iR_atpair.first; + const auto Mu = iR_atpair.second.first; + const auto Nu = iR_atpair.second.second; + const int n_mu = atom_mu_l[Mu]; + const int n_nu = atom_mu_l[Nu]; + + // thread local temporary matrix + matrix_m> WtR_temp(n_mu, n_nu, major_Wc); + + if (Wc_tau_q.count(Mu) == 0) continue; + if (Wc_tau_q.at(Mu).count(Nu) == 0) continue; + // cout << "freq: " << freq << "\n"; + + const auto &Wc_q_all = Wc_tau_q.at(Mu).at(Nu); + for (auto &Wc_q : Wc_q_all) + { + const auto q = Wc_q.first; + const auto &Wc = 
Wc_q.second; + for (auto q_bz : map_irk_ks[q]) + { + const double ang = -q_bz * (R * latvec) * TWO_PI; + const complex weight = + complex(cos(ang), sin(ang)) / double(n_kpoints); + if (q == q_bz) + WtR_temp += Wc * weight; + else + WtR_temp += conj(Wc) * weight; + } + } + // omp_set_lock(&lock_Wc); + Wc_tau_R[Mu][Nu][R] += WtR_temp; + // omp_unset_lock(&lock_Wc); + } + + LIBRPA::utils::lib_printf_root("Done converting Wc q,t -> R,t\n"); + mpi_comm_global_h.barrier(); + return Wc_tau_R; +} + +void test_libcomm_for_system(const atpair_k_cplx_mat_t &coulmat) +{ + if (mpi_comm_global_h.myid == 0) lib_printf("test_libcomm_for_system Coulumb\n"); + // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + + vector> qpts; + for (const auto &qMuNuchi : irk_weight) qpts.push_back(qMuNuchi.first); + +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + int count_coul = 0; + for (const auto &Mu_Nu : local_atpair) + { + const auto 
Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << + // endl; + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + count_coul += 1; + } + int count_pair = 0; + for (auto &Mu : coul_libri) + { + for (auto &nu_q : Mu.second) + { + count_pair += 1; + } + } + // printf("Finish RPA blacs 2d vq arr\n"); + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + lib_printf( + "Begin comm_map2_first myid: %d q:(%f, %f, %f) count_coul: %d count_pair: %d\n", + mpi_comm_global_h.myid, q.x, q.y, q.z, count_coul, count_pair); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + // printf("End vq comm_map2_first myid: %d TIME_USED: + // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); + // ofs_myid << "IJq_coul" << endl << IJq_coul; + // printf("Finish RPA blacs 2d vq 2d\n"); + double block_begin = omp_get_wtime(); + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // // cout << IJq_coul.at(I).at({J, qa}); + // collect_block_from_IJ_storage_syhe( + // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", + // // blacs_ctxt_global_h.myid, I, J, + // // coul_block.nr(), coul_block.nc(), + // // str(coul_block).c_str()); + // } + 
collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double block_end = omp_get_wtime(); + // lib_printf("Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f + // pair_size: %d\n",mpi_comm_global_h.myid,arr_end-vq_begin, comm_end-comm_begin, + // block_end-block_begin,set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + } + lib_printf("Success test_libcomm_for_system\n"); +#endif +} From 9ce284227658babf9c8b1aaeb3fd754dc289be58 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:11:27 +0800 Subject: [PATCH 02/18] Add files via upload fix a bug which open 4 mpi will contribute error for rap calculation with scalapack and without scalapack --- src/epsilon.cpp | 131 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 16 deletions(-) diff --git a/src/epsilon.cpp b/src/epsilon.cpp index cec86fd0..9ed7d46b 100644 --- a/src/epsilon.cpp +++ b/src/epsilon.cpp @@ -1,5 +1,5 @@ #include "epsilon.h" - +#define OPEN_TEST_FOR_LU_DECOMPOSITION #include #include @@ -317,15 +317,41 @@ CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou // ofs_myid << "Iset Jset " << s0_s1 << endl; // ofs_myid << "atpair_unordered_local of myid " << blacs_ctxt_global_h.myid << " " << // atpair_unordered_local << endl; - + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before vector qpts processid:%d,chi0.tfg.get_freq_nodes()[0]:%f,chi0.get_chi0_q().size():%d\n", mpi_comm_global_h.myid, + // chi0.tfg.get_freq_nodes()[0], chi0.get_chi0_q().size()); + // printf("chi0.get_chi0_q().empty():%d\n", chi0.get_chi0_q().empty()); + 
printf("processId:%d,chi0.klist.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size()); + // for(const auto &k : chi0.klist) + // { + // printf("processId:%d, k: (%f, %f, %f)\n", mpi_comm_global_h.myid, k.x, k.y, k.z); + // } + #endif vector> qpts; - for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - qpts.push_back(qMuNuchi.first); - + + // for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + // { + // qpts.push_back(qMuNuchi.first); + // #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // const auto &q = qMuNuchi.first; + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + // #endif + // } + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } complex tot_RPA_energy(0.0, 0.0); map, complex> cRPA_q; if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif #ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) { coul_block.zero_out(); @@ -345,9 +371,15 @@ CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou const auto Nu = Mu_Nu.second; // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << // endl; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || coulmat.at(Mu).at(Nu).count(q) == 0) continue; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif const auto &Vq = coulmat.at(Mu).at(Nu).at(q); const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); @@ -420,8 +452,19 @@ 
CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou std::map>, Tensor>>> chi0_libri; - const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + // printf("processId:%d,chi0.get_chi0_q().empty():%d\n", mpi_comm_global_h.myid, chi0.get_chi0_q().empty()); + #endif + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + // const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + #endif chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) for (const auto &M_Nchi : chi0_wq) { const auto &M = M_Nchi.first; @@ -444,7 +487,15 @@ CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", chi0_wq.size(), freq, q.x, q.y, q.z); } - chi0.free_chi0_q(freq, q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + LIBRPA::utils::release_free_mem(); // if(mpi_comm_global_h.is_root()) @@ -605,6 +656,10 @@ complex compute_pi_det_blacs_2d(matrix_m> &loc_piT, // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); // print_complex_real_matrix("first_loc_piT_mat",loc_piT); // } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before pzgetrf_ processid:%d,range_all: %d, loc_piT.nr(): %d, loc_piT.nc(): %d\n", + mpi_comm_global_h.myid, range_all, loc_piT.nr(), loc_piT.nc()); + #endif double det_begin = omp_get_wtime(); // 
ScalapackConnector::transpose_desc(DESCPI_T, arrdesc_pi.desc); pzgetrf_(&range_all, &range_all, loc_piT.ptr(), &one, &one, arrdesc_pi.desc, ipiv, &info); @@ -1001,11 +1056,19 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & part_range.resize(atom_mu.size()); part_range[0] = 0; int count_range = 0; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif for (int I = 0; I != atom_mu.size() - 1; I++) { count_range += atom_mu[I]; part_range[I + 1] = count_range; } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif // cout << "part_range:" << endl; // for (int I = 0; I != atom_mu.size(); I++) @@ -1016,20 +1079,29 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & // pi_freq_q contains all atoms map, ComplexMatrix>> pi_freq_q; - - for (const auto &freq_q_MuNupi : pi_freq_q_Mu_Nu) + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("| process %d, qpts.size(): %zu,freq.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size(),chi0.tfg.get_freq_nodes().size()); + #endif + for(const auto &freq : chi0.tfg.get_freq_nodes()) { - const auto freq = freq_q_MuNupi.first; - - for (const auto &q_MuNupi : freq_q_MuNupi.second) - { - const auto q = q_MuNupi.first; - const auto MuNupi = q_MuNupi.second; + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; + if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi 
= freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; pi_freq_q[freq][q].create(range_all, range_all); ComplexMatrix pi_munu_tmp(range_all, range_all); pi_munu_tmp.zero_out(); - + if(!chi0.get_chi0_q().empty()) for (const auto &Mu_Nupi : MuNupi) { const auto Mu = Mu_Nupi.first; @@ -1068,6 +1140,9 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & { complex tot_RPA_energy(0.0, 0.0); map, complex> cRPA_q; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + int num_iteration = 0; + #endif for (const auto &freq_qpi : pi_freq_q) { const auto freq = freq_qpi.first; @@ -1081,6 +1156,21 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & ComplexMatrix identity_minus_pi(range_all, range_all); identity.set_as_identity_matrix(); identity_minus_pi = identity - pi_freq_q[freq][q]; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // if(num_iteration==0) + // if(mpi_comm_global_h.myid == 1) + // { + // complex* test_c= identity_minus_pi.c; + // for(int i=0;i det_for_rpa(1.0, 0.0); int info_LU = 0; int *ipiv = new int[range_all]; @@ -1311,12 +1401,21 @@ map, atom_mapping::pair_t_old>> // std::stringstream ss; // ss<<"out_pi_rank_"< ik_vec = k_pair.first; for (int I = 0; I != natom; I++) { + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif atom_mapping::pair_t_old Vq_row = gather_vq_row_q(I, coulmat, ik_vec); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif for (auto &freq_p : chi0.get_chi0_q()) { const double freq = freq_p.first; From fb9774969a71a4b0425a62e756a503c2949f310e Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:17:39 +0800 Subject: [PATCH 03/18] Delete epsilon.cpp --- epsilon.cpp | 2970 --------------------------------------------------- 1 file changed, 2970 deletions(-) diff 
--git a/epsilon.cpp b/epsilon.cpp index 9ed7d46b..8b137891 100644 --- a/epsilon.cpp +++ b/epsilon.cpp @@ -1,2971 +1 @@ -#include "epsilon.h" -#define OPEN_TEST_FOR_LU_DECOMPOSITION -#include -#include -#include -#include -#include -#include -#include - -#include "atoms.h" -#include "constants.h" -#include "envs_blacs.h" -#include "envs_io.h" -#include "envs_mpi.h" -#include "lapack_connector.h" -#include "libri_utils.h" -#include "matrix_m_parallel_utils.h" -#include "parallel_mpi.h" -#include "params.h" -#include "pbc.h" -#include "profiler.h" -#include "scalapack_connector.h" -#include "stl_io_helper.h" -#include "utils_blacs.h" -#include "utils_io.h" -#include "utils_mem.h" -#include "utils_mpi_io.h" - -#ifdef LIBRPA_USE_LIBRI -#include -#include -using RI::Tensor; -using RI::Communicate_Tensors_Map_Judge::comm_map2_first; -#endif - -using LIBRPA::Array_Desc; -using LIBRPA::envs::blacs_ctxt_global_h; -using LIBRPA::envs::mpi_comm_global_h; -using LIBRPA::envs::ofs_myid; -using LIBRPA::utils::lib_printf; - -CorrEnergy compute_RPA_correlation_blacs_2d_gamma_only(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) -{ - CorrEnergy corr; - if (mpi_comm_global_h.myid == 0) - lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D gamma_only\n"); - // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); - const auto &mf = chi0.mf; - const double CONE = 1.0; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - // std::cout << "n_abf " << n_abf << std::endl; - // std::cout << "n_atoms " << LIBRPA::atomic_basis_abf.n_atoms << std::endl; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - // std::cout << "part_range " << part_range[0] << " " << part_range[1] << std::endl; - auto nbs_ = LIBRPA::atomic_basis_abf.get_atom_nbs(); - // std::cout << "nbs_ " << nbs_[0] << " " << nbs_[1] << std::endl; - - mpi_comm_global_h.barrier(); - - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // use a square blocksize instead max block, otherwise 
heev and inversion will complain about - // illegal parameter - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - auto chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); - auto coul_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); - auto coul_chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); - - vector> qpts; - for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - qpts.push_back(qMuNuchi.first); - - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); -#ifdef LIBRPA_USE_LIBRI - for (const auto &q : qpts) - { - coul_block.zero_out(); - - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - std::array qa = {q.x, q.y, q.z}; - // collect the block elements of coulomb matrices - { - double vq_begin = omp_get_wtime(); - // LibRI tensor for communication, release once done - std::map>, Tensor>> - coul_libri; - - for (const auto &Mu_Nu : local_atpair) - { - const auto Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << - // endl; - if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || - coulmat.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - matrix tmp_vq_real = (*Vq).real(); - std::valarray Vq_va(tmp_vq_real.c, Vq->size); - auto pvq = std::make_shared>(); - *pvq = Vq_va; - coul_libri[Mu][{Nu, std::array{0, 0, 0}}] = - Tensor({n_mu, n_nu}, pvq); - coulmat.at(Mu).at(Nu).at(q).reset(); - } - - LIBRPA::utils::release_free_mem(); - - // printf("Finish RPA blacs 2d 
vq arr\n"); - double arr_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - double comm_begin = omp_get_wtime(); - // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); - const auto IJq_coul = - comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); - double comm_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - - double block_begin = omp_get_wtime(); - - collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - - double block_end = omp_get_wtime(); - lib_printf( - "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", - mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, - block_end - block_begin, set_IJ_nabf_nabf.size()); - mpi_comm_global_h.barrier(); - double vq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", - vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, - vq_end - block_begin); - } - - double chi_arr_time = 0.0; - double chi_comm_time = 0.0; - double chi_2d_time = 0.0; - for (const auto &freq : chi0.tfg.get_freq_nodes()) - { - const auto ifreq = chi0.tfg.get_freq_index(freq); - const double freq_weight = chi0.tfg.find_freq_weight(freq); - double pi_freq_begin = omp_get_wtime(); - chi0_block.zero_out(); - { - double chi_begin_arr = omp_get_wtime(); - std::map>, Tensor>> - chi0_libri; - const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); - - for (const auto &M_Nchi : chi0_wq) - { - const auto &M = M_Nchi.first; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_chi : M_Nchi.second) - { - const auto &N = N_chi.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &chi = N_chi.second.real(); - std::valarray chi_va(chi.c, chi.size); - auto pchi = std::make_shared>(); - *pchi = chi_va; - chi0_libri[M][{N, std::array{0, 0, 0}}] = - 
Tensor({n_mu, n_nu}, pchi); - } - } - - // if(mpi_comm_global_h.is_root()) - // { - // lib_printf("Begin to clean chi0 !!! \n"); - // system("free -m"); - // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); - // } - - chi0.free_chi0_q(freq, q); - - LIBRPA::utils::release_free_mem(); - - // if(mpi_comm_global_h.is_root()) - // { - // lib_printf("After clean chi0 !!! \n"); - // system("free -m"); - // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); - // } - - mpi_comm_global_h.barrier(); - double chi_end_arr = omp_get_wtime(); - // ofs_myid << "chi0_libri" << endl << chi0_libri; - - const auto IJq_chi0 = - comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); - // ofs_myid << "IJq_chi0" << endl << IJq_chi0; - double chi_end_comm = omp_get_wtime(); - - collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, - LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, - MAJOR::ROW); - // printf("End collect block myid: %d ifreq: %d TIME_USED: - // %f\n",mpi_comm_global_h.myid,ifreq,chi_end_comm-chi_end_arr); - mpi_comm_global_h.barrier(); - double chi_end_2d = omp_get_wtime(); - - chi_arr_time = (chi_end_arr - chi_begin_arr); - chi_comm_time = (chi_end_comm - chi_end_arr); - chi_2d_time = (chi_end_2d - chi_end_comm); - } - - double pi_begin = omp_get_wtime(); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, - desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc); - // char fnp[100]; - // sprintf(fnp, "pi_ifreq_%d_iq_%d.mtx", ifreq, iq); - double pi_end = omp_get_wtime(); - // printf("End pgemm myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); - double trace_pi = 0.0; - double trace_pi_loc = 0.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); - } - - coul_chi0_block 
*= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; - } - - int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; - int info; - // printf("begin det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); - double ln_det = - compute_pi_det_blacs_2d_gamma_only(coul_chi0_block, desc_nabf_nabf, ipiv, info); - // printf("End det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); - double det_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); - double pi_freq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - { - lib_printf( - "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " - "%f, CHI_2d: %f, Pi: %f, Det: %f\n", - freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, - chi_2d_time, pi_end - pi_begin, det_end - pi_end); - complex rpa_for_omega_q = complex(trace_pi + ln_det); - /*std::cout << "q: " << iq << ", freq: " << ifreq << ", ln_det:" << ln_det - << ", trace_pi: " << trace_pi << ", rpa_for_omega_q" << rpa_for_omega_q - << ", contribution: " - << rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI << std::endl;*/ - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check - // std::cout << "rpa_for_omega_q: " << rpa_for_omega_q - // << ", freq_weight: " << freq_weight << ", irk_weight[q]:" << - // irk_weight[q] - // << ", cRPA_q[q]: " << cRPA_q[q] << std::endl; - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - } -#else - throw std::logic_error("need compilation with LibRI"); -#endif - if (mpi_comm_global_h.myid == 0) - { - for (auto &q_crpa : cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - } - } - mpi_comm_global_h.barrier(); - corr.value = tot_RPA_energy; - - corr.etype = CorrEnergy::type::RPA; - return corr; -} - -CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) -{ - lib_printf("Begin to compute_RPA_correlation_blacs_2d myid: %d\n", mpi_comm_global_h.myid); - system("free -m"); - CorrEnergy corr; - if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D\n"); - // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); - const auto &mf = chi0.mf; - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - mpi_comm_global_h.barrier(); - - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // use a square blocksize instead max block, otherwise heev and inversion will complain about - // illegal parameter - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - // ofs_myid << "Iset Jset " << s0_s1 << endl; - // ofs_myid << "atpair_unordered_local of myid " << blacs_ctxt_global_h.myid << " " << - // 
atpair_unordered_local << endl; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before vector qpts processid:%d,chi0.tfg.get_freq_nodes()[0]:%f,chi0.get_chi0_q().size():%d\n", mpi_comm_global_h.myid, - // chi0.tfg.get_freq_nodes()[0], chi0.get_chi0_q().size()); - // printf("chi0.get_chi0_q().empty():%d\n", chi0.get_chi0_q().empty()); - printf("processId:%d,chi0.klist.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size()); - // for(const auto &k : chi0.klist) - // { - // printf("processId:%d, k: (%f, %f, %f)\n", mpi_comm_global_h.myid, k.x, k.y, k.z); - // } - #endif - vector> qpts; - - // for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - // { - // qpts.push_back(qMuNuchi.first); - // #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // const auto &q = qMuNuchi.first; - // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); - // #endif - // } - for(const auto &q : chi0.klist) - { - qpts.push_back(q); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); - #endif - } - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); - #endif -#ifdef LIBRPA_USE_LIBRI - - for (const auto &q : qpts) - { - coul_block.zero_out(); - - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - std::array qa = {q.x, q.y, q.z}; - // collect the block elements of coulomb matrices - { - double vq_begin = omp_get_wtime(); - // LibRI tensor for communication, release once done - std::map>, Tensor>>> - coul_libri; - coul_libri.clear(); - for (const auto &Mu_Nu : local_atpair) - { - const auto Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << - // endl; - 
#ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before if coulmat.count:%d\n", mpi_comm_global_h.myid); - #endif - if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || - coulmat.at(Mu).at(Nu).count(q) == 0) - continue; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after if coulmat.count:%d\n", mpi_comm_global_h.myid); - #endif - const auto &Vq = coulmat.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); - } - // printf("Finish RPA blacs 2d vq arr\n"); - double arr_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - double comm_begin = omp_get_wtime(); - // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); - const auto IJq_coul = - comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); - double comm_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - // printf("End vq comm_map2_first myid: %d TIME_USED: - // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); - // ofs_myid << "IJq_coul" << endl << IJq_coul; - // printf("Finish RPA blacs 2d vq 2d\n"); - double block_begin = omp_get_wtime(); - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // // cout << IJq_coul.at(I).at({J, qa}); - // collect_block_from_IJ_storage_syhe( - // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", - // // blacs_ctxt_global_h.myid, I, J, - // // coul_block.nr(), coul_block.nc(), - // // str(coul_block).c_str()); - // } - collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - double 
block_end = omp_get_wtime(); - lib_printf( - "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", - mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, - block_end - block_begin, set_IJ_nabf_nabf.size()); - mpi_comm_global_h.barrier(); - double vq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", - vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, - vq_end - block_begin); - } - - // if(mpi_comm_global_h.is_root()) - // printf("Finish RPA blacs 2d vq comm\n"); - // char fn[100]; - // sprintf(fn, "coul_iq_%d.mtx", iq); - // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); - // ofs_myid << str(coul_block); - // lib_printf("coul_block\n%s", str(coul_block).c_str()); - double chi_arr_time = 0.0; - double chi_comm_time = 0.0; - double chi_2d_time = 0.0; - for (const auto &freq : chi0.tfg.get_freq_nodes()) - { - const auto ifreq = chi0.tfg.get_freq_index(freq); - const double freq_weight = chi0.tfg.find_freq_weight(freq); - double pi_freq_begin = omp_get_wtime(); - chi0_block.zero_out(); - { - double chi_begin_arr = omp_get_wtime(); - std::map>, Tensor>>> - chi0_libri; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); - // printf("processId:%d,chi0.get_chi0_q().empty():%d\n", mpi_comm_global_h.myid, chi0.get_chi0_q().empty()); - #endif - atom_mapping::pair_t_old chi0_wq; - if(!chi0.get_chi0_q().empty()) - chi0_wq = chi0.get_chi0_q().at(freq).at(q); - // const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); - #endif - chi0_libri.clear(); - if(!chi0.get_chi0_q().empty()) - for (const auto &M_Nchi : chi0_wq) - { - const auto &M = M_Nchi.first; - const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_chi : M_Nchi.second) - { - const auto &N = N_chi.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &chi = N_chi.second; - std::valarray> chi_va(chi.c, chi.size); - auto pchi = std::make_shared>>(); - *pchi = chi_va; - chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); - } - } - if (mpi_comm_global_h.is_root()) - { - lib_printf("Begin to clean chi0 !!! \n"); - LIBRPA::utils::display_free_mem(); - lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", - chi0_wq.size(), freq, q.x, q.y, q.z); - } - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); - #endif - if(!chi0.get_chi0_q().empty()) - chi0.free_chi0_q(freq, q); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); - #endif - - - LIBRPA::utils::release_free_mem(); - // if(mpi_comm_global_h.is_root()) - // { - // lib_printf("After clean chi0 !!! 
\n"); - // system("free -m"); - // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); - // } - mpi_comm_global_h.barrier(); - double chi_end_arr = omp_get_wtime(); - // ofs_myid << "chi0_libri" << endl << chi0_libri; - - const auto IJq_chi0 = - comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); - // ofs_myid << "IJq_chi0" << endl << IJq_chi0; - double chi_end_comm = omp_get_wtime(); - collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, - LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, - MAJOR::ROW); - mpi_comm_global_h.barrier(); - double chi_end_2d = omp_get_wtime(); - - chi_arr_time = (chi_end_arr - chi_begin_arr); - chi_comm_time = (chi_end_comm - chi_end_arr); - chi_2d_time = (chi_end_2d - chi_end_comm); - // char fnc[100]; - // sprintf(fnc, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); - // if( ifreq== 0) - // print_matrix_mm_file_parallel(fnc, chi0_block, desc_nabf_nabf); - } - - double pi_begin = omp_get_wtime(); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, - desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc); - // char fnp[100]; - // sprintf(fnp, "pi_ifreq_%d_iq_%d.mtx", ifreq, iq); - double pi_end = omp_get_wtime(); - - complex trace_pi(0.0, 0.0); - complex trace_pi_loc(0.0, 0.0); - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); - } - - coul_chi0_block *= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; - // std::cout << "1-Pi: " << ilo << "," << jlo << "," << coul_chi0_block(ilo, jlo) - //<< std::endl; - } - // if( ifreq== 0 && mpi_comm_global_h.is_root() ) - // print_whole_matrix("pi-2D-loc", 
coul_chi0_block); - - int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; - int info; - complex ln_det = - compute_pi_det_blacs_2d(coul_chi0_block, desc_nabf_nabf, ipiv, info); - double det_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, - mpi_comm_global_h.comm); - double pi_freq_end = omp_get_wtime(); - // double task_end = omp_get_wtime(); - // if(mpi_comm_global_h.is_root()) - // lib_printf("| After det for freq: %f, q: ( %f, %f, %f) TIME_LOCMAT: %f - // TIME_DET: %f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n",ifreq, - // q.x,q.y,q.z,task_mid-task_begin,task_end-task_mid,pi_time,loc_tran_time); - // para_mpi.mpi_barrier(); - - if (mpi_comm_global_h.myid == 0) - { - lib_printf( - "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " - "%f, CHI_2d: %f, Pi: %f, Det: %f\n", - freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, - chi_2d_time, pi_end - pi_begin, det_end - pi_end); - complex rpa_for_omega_q = trace_pi + ln_det; - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - } -#else - throw std::logic_error("need compilation with LibRI"); -#endif - if (mpi_comm_global_h.myid == 0) - { - for (auto &q_crpa : cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - // cout << q_crpa.first << q_crpa.second << endl; - } - // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) - // < &loc_piT, const Array_Desc &arrdesc_pi, - int *ipiv, int &info) -{ - int one = 1; - int range_all = N_all_mu; - int DESCPI_T[9]; - - double det_begin = omp_get_wtime(); - - ScalapackConnector::pgetrf_f(range_all, range_all, loc_piT.ptr(), one, one, arrdesc_pi.desc, - ipiv, info); - double trf_end = omp_get_wtime(); - - double ln_det_loc = 0.0; - double ln_det_all = 0.0; - - for (int ig = 0; ig != range_all; ig++) - { - int locr = arrdesc_pi.indx_g2l_r(ig); - int locc = arrdesc_pi.indx_g2l_c(ig); - if (locr >= 0 && locc >= 0) - { - double tmp_ln_det; - if (loc_piT(locr, locc) > 0) - { - tmp_ln_det = std::log(loc_piT(locr, locc)); - } - else - { - tmp_ln_det = std::log(-loc_piT(locr, locc)); - } - ln_det_loc += tmp_ln_det; - } - } - double ln_end = omp_get_wtime(); - - MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); - double det_end = omp_get_wtime(); - return ln_det_all; -} -complex compute_pi_det_blacs_2d(matrix_m> &loc_piT, - const Array_Desc &arrdesc_pi, int *ipiv, int &info) -{ - int one = 1; - int range_all = N_all_mu; - int DESCPI_T[9]; - // if(out_pi) - // { - // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); - // print_complex_real_matrix("first_loc_piT_mat",loc_piT); - // } - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - printf("success before pzgetrf_ processid:%d,range_all: %d, loc_piT.nr(): %d, loc_piT.nc(): %d\n", - mpi_comm_global_h.myid, range_all, loc_piT.nr(), loc_piT.nc()); - #endif - double det_begin = omp_get_wtime(); - // ScalapackConnector::transpose_desc(DESCPI_T, 
arrdesc_pi.desc); - pzgetrf_(&range_all, &range_all, loc_piT.ptr(), &one, &one, arrdesc_pi.desc, ipiv, &info); - double trf_end = omp_get_wtime(); - // ScalapackConnector::pgetrf_f(range_all,range_all,loc_piT.c,one,one,DESCPI_T,ipiv, info); - // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); - // printf("desc myid: %d, m n: %d,%d, mb nb: %d, %d, loc_m_n: %d, %d, myp: %d,%d, npr,npc: - // %d, %d\n",mpi_comm_global_h.myid, arrdesc_pi.m(),arrdesc_pi.n(), - // arrdesc_pi.mb(),arrdesc_pi.nb(), - // arrdesc_pi.m_loc(),arrdesc_pi.n_loc(),arrdesc_pi.myprow(),arrdesc_pi.mypcol(),arrdesc_pi.nprows(),arrdesc_pi.npcols()); - complex ln_det_loc(0.0, 0.0); - complex ln_det_all(0.0, 0.0); - // complex det_loc(1.0,0.0); - // complex det_glo(0.0,0.0); - // vector> det_dig; - // vector> ln_det_dig; - // vector> det_dig_r; - // vector> det_dig_c; - // printf(" myid: %d ig=25, locr,locc: %d, - // %d)\n",mpi_comm_global_h.myid,arrdesc_pi.indx_g2l_r(25),arrdesc_pi.indx_g2l_c(25)); - for (int ig = 0; ig != range_all; ig++) - { - // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); - // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); - int locr = arrdesc_pi.indx_g2l_r(ig); - int locc = arrdesc_pi.indx_g2l_c(ig); - if (locr >= 0 && locc >= 0) - { - // if(ipiv[locr]!=(ig+1)) - // det_loc=-1*det_loc * loc_piT(locc,locr); - // else - // det_loc=det_loc * loc_piT(locc,locr); - // det_dig.push_back(loc_piT(locr,locc)); - // det_dig_r.push_back(locr); - // det_dig_c.push_back(locc); - complex tmp_ln_det; - if (loc_piT(locr, locc).real() > 0) - { - tmp_ln_det = std::log(loc_piT(locr, locc)); - // ln_det_dig.push_back(tmp_ln_det); - } - else - { - tmp_ln_det = std::log(-loc_piT(locr, locc)); - // ln_det_dig.push_back(tmp_ln_det); - } - ln_det_loc += tmp_ln_det; - } - } - double ln_end = omp_get_wtime(); - // ComplexMatrix det_mm(loc_piT.nr(),loc_piT.nc()); - // for(int i=0;i!=loc_piT.nr();i++) - // for(int j=0;j!=loc_piT.nc();j++) - // 
det_mm(i,j)=loc_piT(i,j); - // // sort(det_dig.rbegin(),det_dig.rend()); - // ComplexMatrix det_dig_mm(det_dig.size(),4); - // for(int i=0;i!=det_dig.size();i++) - // { - // det_dig_mm(i,0) =det_dig_r[i]; - // det_dig_mm(i,1) =det_dig_c[i]; - // det_dig_mm(i,2)=det_dig[i]; - // det_dig_mm(i,3)=ln_det_dig[i]; - // } - // char fn[100]; - // sprintf(fn, "det_dig_myid_%d.mtx", mpi_comm_global_h.myid); - // print_complex_matrix_file("det_dig_loc", det_dig_mm, fn, false); - - // sprintf(fn, "det_mat_myid_%d.mtx", mpi_comm_global_h.myid); - // print_complex_matrix_file("det_mat_loc", det_mm, fn, false); - - MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); - double det_end = omp_get_wtime(); - // if(mpi_comm_global_h.myid == 0) - // lib_printf(" | Det time trf: %f ln: %f allreduce: - // %f\n",trf_end-det_begin,ln_end-trf_end, det_end-ln_end); - // MPI_Allreduce(&det_loc,&det_glo,1,MPI_DOUBLE_COMPLEX,MPI_PROD,mpi_comm_global_h.comm); - // ln_det_all=std::log(det_glo); - return ln_det_all; -} - -complex compute_pi_det_blacs(ComplexMatrix &loc_piT, const Array_Desc &arrdesc_pi, - int *ipiv, int &info) -{ - // int range_all = atom_mu_part_range[natom-1]+atom_mu[natom-1]; - // int desc_pi[9]; - // int loc_row, loc_col, info; - // int row_nblk=1; - // int col_nblk=1; - int one = 1; - int range_all = N_all_mu; - // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,range_all,range_all,row_nblk,col_nblk); - // int *ipiv = new int [loc_row*10]; - // ComplexMatrix loc_piT(loc_col,loc_row); - - // for(int i=0;i!=loc_row;i++) - // { - // int global_row = para_mpi.globalIndex(i,row_nblk,para_mpi.nprow,para_mpi.myprow); - // int mu; - // int I=atom_mu_glo2loc(global_row,mu); - // for(int j=0;j!=loc_col;j++) - // { - // int global_col = para_mpi.globalIndex(j,col_nblk,para_mpi.npcol,para_mpi.mypcol); - // int nu; - // int J=atom_mu_glo2loc(global_col,nu); - - // if( global_col == global_row) - // { - // loc_piT(j,i)=complex(1.0,0.0) - 
pi_freq_q.at(I).at(J)(mu,nu); - // } - // else - // { - // loc_piT(j,i)=-1* pi_freq_q.at(I).at(J)(mu,nu); - // } - - // } - // } - int DESCPI_T[9]; - // if(out_pi) - // { - // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); - // print_complex_real_matrix("first_loc_piT_mat",loc_piT); - // } - - ScalapackConnector::transpose_desc(DESCPI_T, arrdesc_pi.desc); - - // para_mpi.mpi_barrier(); - // printf(" before LU Myid: %d Available DOS memory = %ld - // bytes\n",mpi_comm_global_h.myid, memavail()); printf(" before LU myid: %d range_all: %d, - // loc_mat.size: %d\n",mpi_comm_global_h.myid,range_all,loc_piT.size); - pzgetrf_(&range_all, &range_all, loc_piT.c, &one, &one, DESCPI_T, ipiv, &info); - // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); - complex ln_det_loc(0.0, 0.0); - complex ln_det_all(0.0, 0.0); - for (int ig = 0; ig != range_all; ig++) - { - // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); - // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); - int locr = arrdesc_pi.indx_g2l_r(ig); - int locc = arrdesc_pi.indx_g2l_c(ig); - if (locr >= 0 && locc >= 0) - { - // if(ipiv[locr]!=(ig+1)) - // det_loc=-1*det_loc * loc_piT(locc,locr); - // else - // det_loc=det_loc * loc_piT(locc,locr); - if (loc_piT(locc, locr).real() > 0) - ln_det_loc += std::log(loc_piT(locc, locr)); - else - ln_det_loc += std::log(-loc_piT(locc, locr)); - } - } - MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); - return ln_det_all; -} - -CorrEnergy compute_RPA_correlation_blacs(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) -{ - CorrEnergy corr; - if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK row\n"); - - const auto &mf = chi0.mf; - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - mpi_comm_global_h.barrier(); - - 
LIBRPA::Array_Desc arrdesc_pi(blacs_ctxt_global_h); - arrdesc_pi.init_square_blk(n_abf, n_abf, 0, 0); - int loc_row = arrdesc_pi.m_loc(), loc_col = arrdesc_pi.n_loc(), info; - - // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,N_all_mu,N_all_mu,row_nblk,col_nblk); - int *ipiv = new int[loc_row * 10]; - // double vq_begin_m2t= omp_get_wtime(); - // std::map>, Tensor>>> - // vq_libri; for(auto &Ip:Vq) - // { - // auto I=Ip.first; - // const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(I); - // for(auto &Jp:Ip.second) - // { - // auto J=Jp.first; - // const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(J); - // for(auto &qp:Jp.second) - // { - // auto q=qp.first; - // std::array qa = {q.x, q.y, q.z}; - // const auto &vq_ptr=qp.second; - // std::valarray> Vq_va(vq_ptr->c, vq_ptr->size); - // auto pvq = std::make_shared>>(); - // *pvq = Vq_va; - // vq_libri[I][{J, qa}] = Tensor>({n_mu, n_nu}, pvq); - // if(I!=J) - // { - // auto vqT=transpose(*vq_ptr, 1); - // std::valarray> VqT_va(vqT.c, vqT.size); - // auto pvqT = std::make_shared>>(); - // *pvqT = VqT_va; - // vq_libri[J][{I, qa}] = Tensor>({n_nu, n_mu}, pvqT); - // } - // } - // } - // } - // double vq_end_m2t = omp_get_wtime(); - // set loc_atp_IJ; - // for(auto &atp:local_atpair) - // { - // loc_atp_IJ.insert(atp.first); - // loc_atp_IJ.insert(atp.second); - // } - // set all_atom_set; - // for(int I=0;I!=natom;I++) - // all_atom_set.insert(I); - // const auto IJq_coul = Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, - // vq_libri, all_atom_set, loc_atp_IJ); atpair_k_cplx_mat_t Vq_loc; double vq_end_comm = - // omp_get_wtime(); for(auto Ip:IJq_coul) - // { - // auto I=Ip.first; - // auto n_mu=atom_mu[I]; - // for(auto &Jqp:Ip.second) - // { - // auto J=Jqp.first.first; - // auto n_nu=atom_mu[J]; - // auto qa=Jqp.first.second; - // Vector3_Order q{qa[0],qa[1],qa[2]}; - // shared_ptr vq_ptr = make_shared(); - // vq_ptr->create(n_mu, n_nu); - // const auto length=sizeof(complex)* n_mu 
*n_nu; - // memcpy((*vq_ptr).c, Jqp.second.ptr(),length); - // Vq_loc[I][J][q]=vq_ptr; - // //printf("| process %d, I: %d J: %d\n",mpi_comm_global_h.myid, I,J ); - // } - // } - // double vq_end_t2m = omp_get_wtime(); - // mpi_comm_global_h.barrier(); - // if(mpi_comm_global_h.is_root()) - // lib_printf("| Vq_time %f, TIME_m2t: %f TIME_comm: %f TIME_t2m: - // %f\n",vq_end_t2m-vq_begin_m2t,vq_end_m2t-vq_begin_m2t,vq_end_comm-vq_end_m2t,vq_end_t2m-vq_end_comm); - map, ComplexMatrix>> pi_freq_q; - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - for (const auto &freq_q_MuNuchi0 : chi0.get_chi0_q()) - { - const auto freq = freq_q_MuNuchi0.first; - const double freq_weight = chi0.tfg.find_freq_weight(freq); - for (const auto &q_MuNuchi0 : freq_q_MuNuchi0.second) - { - double task_begin = omp_get_wtime(); - const auto q = q_MuNuchi0.first; - auto &MuNuchi0 = q_MuNuchi0.second; - - // ComplexMatrix loc_piT(loc_col,loc_row); - auto loc_piT = init_local_mat>(arrdesc_pi, MAJOR::COL); - complex trace_pi(0.0, 0.0); - double vq_time = 0.0; - double pi_time = 0.0; - double loc_tran_time = 0.0; - for (int Mu = 0; Mu != natom; Mu++) - { - double Mu_begin = omp_get_wtime(); - // lib_printf(" |process %d, Mu: %d\n",mpi_comm_global_h.myid,Mu); - const size_t n_mu = atom_mu[Mu]; - atom_mapping::pair_t_old Vq_row = gather_vq_row_q(Mu, coulmat, q); - double Mu_after_vq = omp_get_wtime(); - // atom_mapping::pair_t_old Vq_row; - // const auto IJq_coul = - // Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, vq_libri, - // {Mu}, loc_atp_atoms); double Mu_vq_comm = omp_get_wtime(); for(auto Ip:IJq_coul) - // { - // auto I=Ip.first; - // auto n_mu=atom_mu[I]; - // for(auto &Jqp:Ip.second) - // { - // auto J=Jqp.first.first; - // auto n_nu=atom_mu[J]; - // auto q=Jqp.first.second; - // Vq_row[I][J].create(n_mu,n_nu); - // const auto length=sizeof(complex)* n_mu *n_nu; - // memcpy(Vq_row[I][J].c, Jqp.second.ptr(),length); - // } - // } - // double 
Mu_after_vq=omp_get_wtime(); - // printf(" |process %d, Mu: %d vq_row.size: - // %d\n",para_mpi.get_myid(),Mu,Vq_row[Mu].size()); ComplexMatrix - // loc_pi_row=compute_Pi_freq_q_row(q,MuNuchi0,Vq_loc,Mu,q); - ComplexMatrix loc_pi_row = compute_Pi_freq_q_row(q, MuNuchi0, Vq_row, Mu); - // printf(" |process %d, compute_pi\n",para_mpi.get_myid()); - ComplexMatrix glo_pi_row(n_mu, N_all_mu); - mpi_comm_global_h.barrier(); - mpi_comm_global_h.allreduce_ComplexMatrix(loc_pi_row, glo_pi_row); - double Mu_after_pi_loc = omp_get_wtime(); - // cout<<" glo_pi_rowT nr,nc: "<(1.0, 0.0) - - glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); - } - else - { - loc_piT(i, j) = - -glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); - } - } - } - double Mu_after_loc_tran = omp_get_wtime(); - vq_time += (Mu_after_vq - Mu_begin); - pi_time += (Mu_after_pi_loc - Mu_after_vq); - loc_tran_time += (Mu_after_loc_tran - Mu_after_pi_loc); - } - // if(freq == chi0.tfg.get_freq_nodes()[0] && mpi_comm_global_h.is_root()) - // print_complex_matrix(" loc_piT",loc_piT); - double task_mid = omp_get_wtime(); - // printf("|process %d, before det\n",mpi_comm_global_h.myid); - complex ln_det = compute_pi_det_blacs_2d(loc_piT, arrdesc_pi, ipiv, info); - double task_end = omp_get_wtime(); - if (mpi_comm_global_h.is_root()) - lib_printf( - "| After det for freq: %f, q: ( %f, %f, %f) TIME_Vq_COMM: %f TIME_DET: " - "%f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n", - freq, q.x, q.y, q.z, vq_time, task_end - task_mid, pi_time, loc_tran_time); - // para_mpi.mpi_barrier(); - if (mpi_comm_global_h.myid == 0) - { - complex rpa_for_omega_q = trace_pi + ln_det; - // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " - // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - } - - if (mpi_comm_global_h.myid == 0) - { - for (auto &q_crpa : cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - // cout << q_crpa.first << q_crpa.second << endl; - } - // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) - // <, atom_mapping::pair_t_old>> - pi_freq_q_Mu_Nu; - if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || - LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) - pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); - else - pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); - lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, - pi_freq_q_Mu_Nu.size()); - // mpi_comm_global_h.barrier(); - - int range_all = N_all_mu; - - vector part_range; - part_range.resize(atom_mu.size()); - part_range[0] = 0; - int count_range = 0; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before part_range processid:%d, atom_mu.size(): %zu\n", - // mpi_comm_global_h.myid, atom_mu.size()); - #endif - for (int I = 0; I != atom_mu.size() - 1; I++) - { - count_range += atom_mu[I]; - part_range[I + 1] = count_range; - } - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after part_range processid:%d, atom_mu.size(): %zu\n", - // mpi_comm_global_h.myid, atom_mu.size()); - #endif - - // cout << "part_range:" << endl; - // for (int I = 0; I != atom_mu.size(); I++) - // { - // cout << part_range[I] << endl; - // } - // cout << "part_range over" << endl; - - // pi_freq_q contains all atoms - map, ComplexMatrix>> pi_freq_q; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("| process %d, qpts.size(): %zu,freq.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size(),chi0.tfg.get_freq_nodes().size()); - #endif - for(const auto &freq : chi0.tfg.get_freq_nodes()) - { - // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); - map, atom_mapping::pair_t_old> freq_q_MuNupi; 
- if(!chi0.get_chi0_q().empty()) - freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", - // mpi_comm_global_h.myid, freq_q_MuNupi.size()); - #endif - for(const auto &q:chi0.klist){ - atom_mapping::pair_t_old q_MuNupi; - if(!chi0.get_chi0_q().empty()) - q_MuNupi = freq_q_MuNupi.at(q); - const auto MuNupi = q_MuNupi; - pi_freq_q[freq][q].create(range_all, range_all); - - ComplexMatrix pi_munu_tmp(range_all, range_all); - pi_munu_tmp.zero_out(); - if(!chi0.get_chi0_q().empty()) - for (const auto &Mu_Nupi : MuNupi) - { - const auto Mu = Mu_Nupi.first; - const auto Nupi = Mu_Nupi.second; - const size_t n_mu = atom_mu[Mu]; - for (const auto &Nu_pi : Nupi) - { - const auto Nu = Nu_pi.first; - const auto pimat = Nu_pi.second; - const size_t n_nu = atom_mu[Nu]; - - for (size_t mu = 0; mu != n_mu; ++mu) - { - for (size_t nu = 0; nu != n_nu; ++nu) - { - pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); - } - } - } - } - if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || - LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) - { - mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); - } - else - { - pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); - } - } - } - // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, - // pi_freq_q_Mu_Nu.size()); - mpi_comm_global_h.barrier(); - // if (mpi_comm_global_h.myid == 0) - { - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - int num_iteration = 0; - #endif - for (const auto &freq_qpi : pi_freq_q) - { - const auto freq = freq_qpi.first; - const double freq_weight = chi0.tfg.find_freq_weight(freq); - for (const auto &q_pi : freq_qpi.second) - { - const auto q = q_pi.first; - const auto pimat = q_pi.second; - complex rpa_for_omega_q(0.0, 0.0); - ComplexMatrix 
identity(range_all, range_all); - ComplexMatrix identity_minus_pi(range_all, range_all); - identity.set_as_identity_matrix(); - identity_minus_pi = identity - pi_freq_q[freq][q]; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // if(num_iteration==0) - // if(mpi_comm_global_h.myid == 1) - // { - // complex* test_c= identity_minus_pi.c; - // for(int i=0;i det_for_rpa(1.0, 0.0); - int info_LU = 0; - int *ipiv = new int[range_all]; - LapackConnector::zgetrf(range_all, range_all, identity_minus_pi, range_all, ipiv, - &info_LU); - for (int ib = 0; ib != range_all; ib++) - { - if (ipiv[ib] != (ib + 1)) - det_for_rpa = -det_for_rpa * identity_minus_pi(ib, ib); - else - det_for_rpa = det_for_rpa * identity_minus_pi(ib, ib); - } - delete[] ipiv; - - complex trace_pi; - complex ln_det; - ln_det = std::log(det_for_rpa); - trace_pi = trace(pi_freq_q.at(freq).at(q)); - // cout << "PI trace vector:" << endl; - // cout << endl; - rpa_for_omega_q = ln_det + trace_pi; - // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " - // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - // lib_printf("Finish EcRPA %4d, size %zu\n", mpi_comm_global_h.myid, - // pi_freq_q_Mu_Nu.size()); - mpi_comm_global_h.barrier(); - map, complex> global_cRPA_q; - for (auto q_weight : irk_weight) - { - MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, - MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); - } - - for (auto &q_crpa : global_cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - } - complex gather_tot_RPA_energy(0.0, 0.0); - MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, - mpi_comm_global_h.comm); - corr.value = gather_tot_RPA_energy; - } - corr.etype = CorrEnergy::type::RPA; - return corr; -} - -CorrEnergy compute_MP2_correlation(const Chi0 
&chi0, const atpair_k_cplx_mat_t &coulmat) -{ - CorrEnergy corr; - corr.etype = CorrEnergy::type::MP2; - return corr; -} - -map, atom_mapping::pair_t_old>> compute_Pi_q( - const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) -{ - map, atom_mapping::pair_t_old>> pi; - lib_printf("Begin compute_Pi_q , pid: %d\n", mpi_comm_global_h.myid); - for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) - { - const double freq = freq_qJQchi0.first; - for (auto &q_JQchi0 : freq_qJQchi0.second) - { - Vector3_Order q = q_JQchi0.first; - for (auto &JQchi0 : q_JQchi0.second) - { - const size_t J = JQchi0.first; - const size_t J_mu = atom_mu[J]; - for (auto &Qchi0 : JQchi0.second) - { - const size_t Q = Qchi0.first; - const size_t Q_mu = atom_mu[Q]; - // auto &chi0_mat = Qchi0.second; - for (int I = 0; I != natom; I++) - { - // const size_t I = I_p.first; - const size_t I_mu = atom_mu[I]; - pi[freq][q][I][Q].create(I_mu, Q_mu); - if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); - } - } - } - // if(freq==chi0.tfg.get_freq_nodes()[0]) - // for(auto &Ip:pi[freq][q]) - // for(auto &Jp:Ip.second) - // lib_printf(" |process %d, pi atpair: %d, %d - // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); - } - } - - // ofstream fp; - // std::stringstream ss; - // ss<<"out_pi_rank_"< ik_vec = k_pair.first; - auto chi0_freq_k = k_pair.second; - for (auto &J_p : chi0_freq_k) - { - const size_t J = J_p.first; - for (auto &Q_p : J_p.second) - { - const size_t Q = Q_p.first; - auto &chi0_mat = Q_p.second; - for (int I = 0; I != natom; I++) - { - // const size_t I = I_p.first; - // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, - // J, Q); - // cout<<" pi_IQ: "< trace_pi; - // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); - // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << - // ik_vec << " trace_pi: " << trace_pi << endl; - // print_complex_matrix_file(sm.str().c_str(), - // (*Vq.at(I).at(J).at(ik_vec)),fp,false); - // print_complex_matrix_file("chi0:", 
chi0_mat,fp,false); - // print_complex_matrix_file("pi_mat:", - // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); - // } - } - else - { - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", - // mpi_comm_global_h.myid, I, J, Q,2); - // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, 0) << - // endl; - pi.at(freq).at(ik_vec).at(I).at(Q) += - transpose(*Vq.at(J).at(I).at(ik_vec), 1) * chi0_mat; - } - - if (J != Q) - { - ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - if (I <= Q) - { - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d - // \n", mpi_comm_global_h.myid, I, J, Q,3); - // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; - pi.at(freq).at(ik_vec).at(I).at(J) += - (*Vq.at(I).at(Q).at(ik_vec)) * chi0_QJ; - } - else - { - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d - // \n", mpi_comm_global_h.myid, I, J, Q,4); - // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, - // 0) << endl; - pi.at(freq).at(ik_vec).at(I).at(J) += - transpose(*Vq.at(Q).at(I).at(ik_vec), 1) * chi0_QJ; - } - } - } - } - } - } - } - // fp.close(); - // print_complex_matrix(" - // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); - /* print_complex_matrix(" - * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ - return pi; -} - -map, atom_mapping::pair_t_old>> compute_Pi_q_MPI( - const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) -{ - map, atom_mapping::pair_t_old>> pi; - lib_printf("Begin compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); - for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) - { - const double freq = freq_qJQchi0.first; - for (auto &q_JQchi0 : freq_qJQchi0.second) - { - Vector3_Order q = q_JQchi0.first; - for (auto &JQchi0 : q_JQchi0.second) - { - const size_t J = JQchi0.first; - const size_t J_mu = atom_mu[J]; 
- for (auto &Qchi0 : JQchi0.second) - { - const size_t Q = Qchi0.first; - const size_t Q_mu = atom_mu[Q]; - // auto &chi0_mat = Qchi0.second; - for (int I = 0; I != natom; I++) - { - // const size_t I = I_p.first; - const size_t I_mu = atom_mu[I]; - pi[freq][q][I][Q].create(I_mu, Q_mu); - if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); - } - } - } - // if(freq==chi0.tfg.get_freq_nodes()[0]) - // for(auto &Ip:pi[freq][q]) - // for(auto &Jp:Ip.second) - // lib_printf(" |process %d, pi atpair: %d, %d - // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); - } - } - - // ofstream fp; - // std::stringstream ss; - // ss<<"out_pi_rank_"< ik_vec = k_pair.first; - for (int I = 0; I != natom; I++) - { - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); - #endif - atom_mapping::pair_t_old Vq_row = gather_vq_row_q(I, coulmat, ik_vec); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); - #endif - for (auto &freq_p : chi0.get_chi0_q()) - { - const double freq = freq_p.first; - const auto chi0_freq = freq_p.second; - - auto chi0_freq_k = freq_p.second.at(ik_vec); - - for (auto &J_p : chi0_freq_k) - { - const size_t J = J_p.first; - for (auto &Q_p : J_p.second) - { - const size_t Q = Q_p.first; - auto &chi0_mat = Q_p.second; - - // const size_t I = I_p.first; - // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, - // J, Q); - // cout<<" pi_IQ: "< trace_pi; - // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); - // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << ik_vec - // << " trace_pi: " << trace_pi << endl; - // print_complex_matrix_file(sm.str().c_str(), - // Vq_row.at(I).at(J),fp,false); print_complex_matrix_file("chi0:", - // chi0_mat,fp,false); print_complex_matrix_file("pi_mat:", - // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); - // } - - if (J != Q) - { - ComplexMatrix 
chi0_QJ = transpose(chi0_mat, 1); - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", - // mpi_comm_global_h.myid, I, J,Q,3); - // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; - pi.at(freq).at(ik_vec).at(I).at(J) += Vq_row.at(I).at(Q) * chi0_QJ; - } - } - } - } - } - } - // fp.close(); - // print_complex_matrix(" - // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); - /* print_complex_matrix(" - * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ - lib_printf("End compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); - return pi; -} - -ComplexMatrix compute_Pi_freq_q_row(const Vector3_Order &ik_vec, - const atom_mapping::pair_t_old &chi0_freq_q, - const atom_mapping::pair_t_old &Vq_row, - const int &I) -{ - map pi; - // lib_printf("Begin cal_pi_k , pid: %d\n", para_mpi.get_myid()); - auto I_mu = atom_mu[I]; - for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); - - omp_lock_t pi_lock; - omp_init_lock(&pi_lock); -#pragma omp parallel for schedule(dynamic) - for (int iap = 0; iap != local_atpair.size(); iap++) - { - const size_t J = local_atpair[iap].first; - const size_t Q = local_atpair[iap].second; - auto &chi0_mat = chi0_freq_q.at(J).at(Q); - auto tmp_pi_mat = Vq_row.at(I).at(J) * chi0_mat; - ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - auto tmp_pi_mat2 = Vq_row.at(I).at(Q) * chi0_QJ; - omp_set_lock(&pi_lock); - pi.at(Q) += tmp_pi_mat; - if (J != Q) - { - pi.at(J) += tmp_pi_mat2; - } - omp_unset_lock(&pi_lock); - } - omp_destroy_lock(&pi_lock); - // for (auto &J_p : chi0_freq_q) - // { - // const size_t J = J_p.first; - // for (auto &Q_p : J_p.second) - // { - // const size_t Q = Q_p.first; - // auto &chi0_mat = Q_p.second; - // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; - // if (J != Q) - // { - // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; - // } - // } - 
// } - // Pi_rowT - // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); - // complex *pi_row_ptr=pi_row.c; - // for(auto &Jp:pi) - // { - // auto J=Jp.first; - // auto J_mu=atom_mu[J]; - // const auto length=sizeof(complex)* I_mu *J_mu; - // memcpy(pi_row_ptr, pi.at(J).c,length); - // pi_row_ptr+=I_mu *J_mu; - // } - ComplexMatrix pi_row(atom_mu[I], N_all_mu); - for (int i = 0; i != pi_row.nr; i++) - for (int J = 0; J != natom; J++) - for (int j = 0; j != atom_mu[J]; j++) - pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); - return pi_row; -} - -ComplexMatrix compute_Pi_freq_q_row_ri(const Vector3_Order &ik_vec, - const atom_mapping::pair_t_old &chi0_freq_q, - const atpair_k_cplx_mat_t &Vq_loc, const int &I, - const Vector3_Order &q) -{ - map pi; - // lib_printf("Begin cal_pi_k , pid: %d\n", mpi_comm_global_h.myid); - auto I_mu = atom_mu[I]; - for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); - - omp_lock_t pi_lock; - omp_init_lock(&pi_lock); -#pragma omp parallel for schedule(dynamic) - for (int iap = 0; iap != local_atpair.size(); iap++) - { - const size_t J = local_atpair[iap].first; - const size_t Q = local_atpair[iap].second; - auto &chi0_mat = chi0_freq_q.at(J).at(Q); - // printf("| IN cal Pi process %d, I: %d J: %d Q: %d\n",mpi_comm_global_h.myid, I,J,Q ); - auto tmp_pi_mat = *Vq_loc.at(I).at(J).at(q) * chi0_mat; - ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - auto tmp_pi_mat2 = *Vq_loc.at(I).at(Q).at(q) * chi0_QJ; - omp_set_lock(&pi_lock); - pi.at(Q) += tmp_pi_mat; - if (J != Q) - { - pi.at(J) += tmp_pi_mat2; - } - omp_unset_lock(&pi_lock); - } - omp_destroy_lock(&pi_lock); - // for (auto &J_p : chi0_freq_q) - // { - // const size_t J = J_p.first; - // for (auto &Q_p : J_p.second) - // { - // const size_t Q = Q_p.first; - // auto &chi0_mat = Q_p.second; - // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; - // if (J != Q) - // { - // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; - // } - // } - 
// } - // Pi_rowT - // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); - // complex *pi_row_ptr=pi_row.c; - // for(auto &Jp:pi) - // { - // auto J=Jp.first; - // auto J_mu=atom_mu[J]; - // const auto length=sizeof(complex)* I_mu *J_mu; - // memcpy(pi_row_ptr, pi.at(J).c,length); - // pi_row_ptr+=I_mu *J_mu; - // } - ComplexMatrix pi_row(atom_mu[I], N_all_mu); - for (int i = 0; i != pi_row.nr; i++) - for (int J = 0; J != natom; J++) - for (int j = 0; j != atom_mu[J]; j++) - pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); - return pi_row; -} - -atom_mapping::pair_t_old gather_vq_row_q(const int &I, - const atpair_k_cplx_mat_t &coulmat, - const Vector3_Order &ik_vec) -{ - auto I_mu = atom_mu[I]; - atom_mapping::pair_t_old Vq_row; - for (int J_tmp = 0; J_tmp != natom; J_tmp++) - { - auto J_mu = atom_mu[J_tmp]; - ComplexMatrix loc_vq(atom_mu[I], atom_mu[J_tmp]); - Vq_row[I][J_tmp].create(atom_mu[I], atom_mu[J_tmp]); - // const auto length=sizeof(complex)* I_mu *J_mu; - // complex *loc_vq_ptr=loc_vq.c; - if (I <= J_tmp) - { - if (Vq.count(I)) - if (Vq.at(I).count(J_tmp)) loc_vq = *Vq.at(I).at(J_tmp).at(ik_vec); - } - else - { - if (Vq.count(J_tmp)) - if (Vq.at(J_tmp).count(I)) loc_vq = transpose(*Vq.at(J_tmp).at(I).at(ik_vec), 1); - } - mpi_comm_global_h.allreduce_ComplexMatrix(loc_vq, Vq_row[I][J_tmp]); - } - return Vq_row; -} - -map, matrix_m>>>::pair_t_old> -compute_Wc_freq_q(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, - atpair_k_cplx_mat_t &coulmat_wc, - const vector> &epsmac_LF_imagfreq) -{ - map, matrix_m>>>::pair_t_old> - Wc_freq_q; - const int range_all = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - if (mpi_comm_global_h.myid == 0) - { - cout << "Calculating Wc using LAPACK" << endl; - } - - mpi_comm_global_h.barrier(); - // use q-points as the outmost loop, so that square root of Coulomb will not be recalculated at - // each frequency point - vector> qpts; - for (const auto &qMuNuchi : 
chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - qpts.push_back(qMuNuchi.first); - - for (const auto &q : qpts) - { - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - char fn[80]; - - ComplexMatrix Vq_all(range_all, range_all); - for (const auto &Mu_NuqVq : coulmat_eps) - { - auto Mu = Mu_NuqVq.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_qVq : Mu_NuqVq.second) - { - auto Nu = Nu_qVq.first; - if (0 == Nu_qVq.second.count(q)) continue; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - { - Vq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = - (*Nu_qVq.second.at(q))(i_mu, i_nu); - Vq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = - conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); - } - } - } - if (Params::debug) - { - sprintf(fn, "Vq_all_q_%d.mtx", iq); - print_complex_matrix_mm(Vq_all, Params::output_dir + "/" + fn, 1e-15); - } - auto sqrtVq_all = power_hemat(Vq_all, 0.5, true, false, Params::sqrt_coulomb_threshold); - // Vq_all is now eigenvectors of the original Coulomb matrix - const auto &Vq_eigen = Vq_all; - if (Params::debug) - { - sprintf(fn, "sqrtVq_all_q_%d.mtx", iq); - print_complex_matrix_mm(sqrtVq_all, Params::output_dir + "/" + fn, 1e-15); - // sprintf(fn, "rotated_sqrtVq_all_q_%d.mtx", iq); - // print_complex_matrix_mm(Vq_all * sqrtVq_all * transpose(Vq_all, true), fn, 1e-15); - // print_complex_matrix_mm(transpose(Vq_all, true) * sqrtVq_all * Vq_all, fn, 1e-15); - sprintf(fn, "Vqeigenvec_q_%d.mtx", iq); - print_complex_matrix_mm(Vq_eigen, Params::output_dir + "/" + fn, 1e-15); - } - - // truncated (cutoff) Coulomb - ComplexMatrix Vqcut_all(range_all, range_all); - for (auto &Mu_NuqVq : coulmat_wc) - { - auto Mu = Mu_NuqVq.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_qVq : Mu_NuqVq.second) - { - auto Nu = Nu_qVq.first; - if (0 == Nu_qVq.second.count(q)) continue; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for 
(int i_nu = 0; i_nu != n_nu; i_nu++) - { - Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = - (*Nu_qVq.second.at(q))(i_mu, i_nu); - Vqcut_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = - conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); - } - } - } - auto sqrtVqcut_all = - power_hemat(Vqcut_all, 0.5, false, true, Params::sqrt_coulomb_threshold); - // sprintf(fn, "sqrtVqcut_all_q_%d.mtx", iq); - // print_complex_matrix_mm(sqrtVqcut_all, fn, 1e-15); - sprintf(fn, "Vqcut_all_filtered_q_%d.mtx", iq); - // print_complex_matrix_mm(Vqcut_all, fn, 1e-15); - // save the filtered truncated Coulomb back to the atom mapping object - // TODO: revise the necessity - for (auto &Mu_NuqVq : coulmat_wc) - { - auto Mu = Mu_NuqVq.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_qVq : Mu_NuqVq.second) - { - auto Nu = Nu_qVq.first; - if (0 == Nu_qVq.second.count(q)) continue; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - (*Nu_qVq.second.at(q))(i_mu, i_nu) = - Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu); - } - } - - ComplexMatrix chi0fq_all(range_all, range_all); - for (const auto &freq_qMuNuchi : chi0.get_chi0_q()) - { - auto freq = freq_qMuNuchi.first; - auto ifreq = chi0.tfg.get_freq_index(freq); - auto MuNuchi = freq_qMuNuchi.second.at(q); - for (const auto &Mu_Nuchi : MuNuchi) - { - auto Mu = Mu_Nuchi.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_chi : Mu_Nuchi.second) - { - auto Nu = Nu_chi.first; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - { - chi0fq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = - Nu_chi.second(i_mu, i_nu); - chi0fq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = - conj(Nu_chi.second(i_mu, i_nu)); - } - } - } - sprintf(fn, "chi0fq_all_q_%d_freq_%d.mtx", iq, ifreq); - print_complex_matrix_mm(chi0fq_all, Params::output_dir + "/" + fn, 1e-15); - - ComplexMatrix identity(range_all, range_all); - 
identity.set_as_identity_matrix(); - auto eps_fq = sqrtVq_all * chi0fq_all * sqrtVq_all; - eps_fq = transpose(Vq_eigen, true) * eps_fq * Vq_eigen; - if (!epsmac_LF_imagfreq.empty() && is_gamma_point(q)) - { - // rotate to Coulomb-diagonal basis - // lib_printf("Largest off-diagonal = %f\n", eps_fq.get_max_abs_offdiag()); - // print_matrix("rotated eps_fq: ", eps_fq.real()); - // replacing the element corresponding to largest Coulomb eigenvalue with dielectric - // function - lib_printf("%22.12f %22.12f %22.12f %22.12f\n", freq, eps_fq(0, 0).real(), - eps_fq(eps_fq.nr - 1, eps_fq.nc - 1).real(), - epsmac_LF_imagfreq[ifreq].real()); - // eps_fq(eps_fq.nr - 1, eps_fq.nc - 1) = epsmac_LF_imagfreq[ifreq]; - eps_fq(0, 0) = 1.0 - epsmac_LF_imagfreq[ifreq]; - } - if (Params::debug) - { - sprintf(fn, "rotated_vsxvs_q_%d_freq_%d.mtx", iq, ifreq); - print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); - } - // rotate back to ABF - eps_fq = Vq_eigen * eps_fq * transpose(Vq_eigen, true); - eps_fq = identity - eps_fq; - if (Params::debug) - { - sprintf(fn, "eps_q_%d_freq_%d.mtx", iq, ifreq); - print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); - } - - // invert the epsilon matrix - power_hemat_onsite(eps_fq, -1); - auto wc_all = sqrtVqcut_all * (eps_fq - identity) * sqrtVqcut_all; - // sprintf(fn, "inveps_q_%d_freq_%d.mtx", iq, ifreq); - // print_complex_matrix_mm(eps_fq, fn, 1e-15); - // sprintf(fn, "wc_q_%d_freq_%d.mtx", iq, ifreq); - // print_complex_matrix_mm(wc_all, fn, 1e-15); - - // save result to the atom mapping object - for (auto &Mu_Nuchi : MuNuchi) - { - auto Mu = Mu_Nuchi.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_chi : Mu_Nuchi.second) - { - auto Nu = Nu_chi.first; - auto n_nu = atom_mu[Nu]; - shared_ptr wc_ptr = make_shared(); - wc_ptr->create(n_mu, n_nu); - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - { - (*wc_ptr)(i_mu, i_nu) = - wc_all(part_range[Mu] + i_mu, part_range[Nu] + 
i_nu); - } - Wc_freq_q[freq][Mu][Nu][q] = - matrix_m>(n_mu, n_nu, wc_ptr->c, MAJOR::ROW, MAJOR::ROW); - } - } - } - } - - return Wc_freq_q; -} - -// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing -map, matrix_m>>>::pair_t_old> -compute_Wc_freq_q_blacs(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, - atpair_k_cplx_mat_t &coulmat_wc, - const vector> &epsmac_LF_imagfreq) -{ - map, matrix_m>>>::pair_t_old> - Wc_freq_q; - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - if (mpi_comm_global_h.myid == 0) - { - cout << "Calculating Wc using ScaLAPACK" << endl; - } - mpi_comm_global_h.barrier(); - - Profiler::start("compute_Wc_freq_q_blacs_init"); - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // Use a square blocksize instead max block, otherwise heev and inversion will complain about - // illegal parameter Maximal blocksize ensure that atom indices related to the rows/columns of a - // local matrix is minimized. - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - // This, however, is not optimal for matrix operations, and may lead to segment fault during - // MPI operations with parallel linear algebra subroutine. 
Thus we define an optimal blocksize - Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); - const int nb_opt = min(128, desc_nabf_nabf.nb()); - desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); - // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric - // matrix - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - // temp_block is used to collect data from IJ-pair data structure with comm_map2_first - auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - // Below are the working arrays for matrix operations - auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - - const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + - coul_chi0_block.size() + coulwc_block.size()) * - 16.0e-6; - ofs_myid << get_timestamp() - << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks - << endl; - - const auto atpair_local = dispatch_upper_trangular_tasks( - natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, - blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); -#ifdef LIBRPA_DEBUG - ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; - ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; -#endif - - // IJ pair of Wc to be returned - pair, set> Iset_Jset_Wc; - for (const auto &ap : atpair_local) - { - Iset_Jset_Wc.first.insert(ap.first); - Iset_Jset_Wc.second.insert(ap.second); - } - - // Prepare local basis indices for 2D->IJ map - int I, 
iI; - map> map_lor_v; - map> map_loc_v; - for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) - { - int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); - LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); - map_lor_v[I].push_back(iI); - } - for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) - { - int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); - LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); - map_loc_v[I].push_back(iI); - } - - vector> qpts; - for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); - - vec eigenvalues(n_abf); - Profiler::cease("compute_Wc_freq_q_blacs_init"); - LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), - Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); - - Profiler::start("compute_Wc_freq_q_work"); -#ifdef LIBRPA_USE_LIBRI - for (const auto &q : qpts) - { - const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); - const int iq_in_k = - std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); - // q-point in fractional coordinates - const auto &qf = kfrac_list[iq_in_k]; - LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, - qpts.size(), qf.x, qf.y, qf.z); - coul_block.zero_out(); - coulwc_block.zero_out(); - // lib_printf("coul_block\n%s", str(coul_block).c_str()); - - // q-array for LibRI object - std::array qa = {q.x, q.y, q.z}; - - // collect the block elements of truncated coulomb matrices first - // as we reuse coul_eigen_block to reduce memory usage - Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); - { - size_t n_singular_coulwc; - // LibRI tensor for communication, release once done - std::map>, RI::Tensor>>> - couleps_libri; - Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); - for (const auto &Mu_Nu : atpair_local) - { - const auto Mu = 
Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; - if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || - coulmat_wc.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); - } - Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); - - Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); - const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); - Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); - - Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // collect_block_from_IJ_storage_syhe( - // coulwc_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // } - collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, - coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - blacs_ctxt_global_h.ictxt); - Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); - Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); - power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, - desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, - Params::sqrt_coulomb_threshold); - Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); - } - Profiler::stop("epsilon_prepare_coulwc_sqrt"); - 
LIBRPA::utils::lib_printf_root( - "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), - Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); - ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; - - Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); - // collect the block elements of coulomb matrices - { - // LibRI tensor for communication, release once done - std::map>, RI::Tensor>>> - couleps_libri; - ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; - for (const auto &Mu_Nu : atpair_local) - { - const auto Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; - if (coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || - coulmat_eps.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); - } - ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; - // ofs_myid << "Couleps_libri" << endl << couleps_libri; - // if (couleps_libri.size() == 0) - // throw std::logic_error("data at q-point not found in coulmat_eps"); - - // perform communication - ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; -#ifdef LIBRPA_DEBUG - ofs_myid << set_IJ_nabf_nabf << endl; - ofs_myid << "Extended blocks" << endl; - ofs_myid << "atom 1: " << s0_s1.first << endl; - ofs_myid << "atom 2: " << s0_s1.second << endl; -#endif - // ofs_myid << "Owned blocks\n"; - // print_keys(ofs_myid, couleps_libri); - // mpi_comm_global_h.barrier(); - const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - 
mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); - ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; - - ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; - collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, - coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - blacs_ctxt_global_h.ictxt); - ofs_myid << get_timestamp() << " Done construct couleps 2D block" << endl; - } - // char fn[100]; - // sprintf(fn, "couleps_iq_%d.mtx", iq); - // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); - // ofs_myid << str(coul_block); - // lib_printf("coul_block\n%s", str(coul_block).c_str()); - - size_t n_singular; - ofs_myid << get_timestamp() << " Start power hemat couleps\n"; - matrix_m> sqrtveig_blacs; - if (is_gamma_point(q)) - { - // choice of power_hemat_blacs_real/power_hemat_blacs_desc - // leads to sub-meV difference - sqrtveig_blacs = power_hemat_blacs_real( - coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, - eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); - if (Params::replace_w_head && Params::option_dielect_func == 3) - { - df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); - } - } - else - { - sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, - desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, - Params::sqrt_coulomb_threshold); - } - ofs_myid << get_timestamp() << " Done power hemat couleps\n"; - // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); - // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at - // imaginary frequency is not prepared - if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); - const size_t n_nonsingular = n_abf - n_singular; - 
Profiler::stop("epsilon_prepare_couleps_sqrt"); - LIBRPA::utils::lib_printf_root( - "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), - Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); - ofs_myid << get_timestamp() << " Done couleps sqrt\n"; - std::flush(ofs_myid); - - for (const auto &freq : chi0.tfg.get_freq_nodes()) - { - const auto ifreq = chi0.tfg.get_freq_index(freq); - Profiler::start("epsilon_wc_work_q_omega"); - Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); - chi0_block.zero_out(); - { - std::map>, - RI::Tensor>>> - chi0_libri; - if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) - { - const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); - for (const auto &M_Nchi : chi0_wq) - { - const auto &M = M_Nchi.first; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_chi : M_Nchi.second) - { - const auto &N = N_chi.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &chi = N_chi.second; - std::valarray> chi_va(chi.c, chi.size); - auto pchi = std::make_shared>>(); - *pchi = chi_va; - chi0_libri[M][{N, qa}] = - RI::Tensor>({n_mu, n_nu}, pchi); - } - } - // Release the chi0 block for this frequency and q to reduce memory load, - // as they will not be used again - chi0.free_chi0_q(freq, q); - } - // ofs_myid << "chi0_libri" << endl << chi0_libri; - Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); - const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); - Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); - // ofs_myid << "IJq_chi0" << endl << IJq_chi0; - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // collect_block_from_IJ_storage_syhe( - // chi0_block, desc_nabf_nabf, 
LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_chi0.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // } - Profiler::start("epsilon_prepare_chi0_2d_collect_block"); - collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, - LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, - MAJOR::ROW); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, - desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); - Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); - // sprintf(fn, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); - // print_matrix_mm_file_parallel(fn, chi0_block, desc_nabf_nabf); - } - Profiler::stop("epsilon_prepare_chi0_2d"); - - Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); - - // for Gamma point, overwrite the head term - if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) - { - ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; - // rotate to Coulomb-eigenvector basis - // descending order - ScalapackConnector::pgemm_f( - 'N', 'N', n_abf, n_nonsingular, n_abf, 1.0, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - 0.0, coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemm_f('C', 'N', n_nonsingular, n_nonsingular, n_abf, 1.0, - sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - - if (Params::option_dielect_func == 3) - { - chi0_block *= -1.0; - for (int i = 0; i != n_nonsingular; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) += 1.0; - } - ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" - << endl; - df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); - 
} - else - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); - if (ilo >= 0 && jlo >= 0) - { - ofs_myid << get_timestamp() << "Perform the head element overwrite" << endl; - chi0_block(ilo, jlo) = 1.0 - epsmac_LF_imagfreq[ifreq]; - } - } - // rotate back to ABF - // descending order - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_nonsingular, n_nonsingular, 1.0, - coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemm_f('N', 'C', n_abf, n_abf, n_nonsingular, 1.0, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - if (Params::option_dielect_func != 3) - { - // now chi0_block is actually v1/2 chi v1/2 - chi0_block *= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) += 1.0; - } - // now chi0_block is actually the dielectric matrix - // perform inversion - Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); - invert_scalapack(chi0_block, desc_nabf_nabf_opt); - } - // subtract 1 from diagonal - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) -= 1.0; - } - } - else - { - Profiler::start("epsilon_compute_eps_pgemm_1"); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, - 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, - 1, desc_nabf_nabf_opt.desc); - Profiler::cease("epsilon_compute_eps_pgemm_1"); - 
Profiler::start("epsilon_compute_eps_pgemm_2"); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, - chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - Profiler::cease("epsilon_compute_eps_pgemm_2"); - // now chi0_block is actually v1/2 chi v1/2 - chi0_block *= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) += 1.0; - } - Profiler::stop("epsilon_compute_eps"); - // now chi0_block is actually the dielectric matrix - // perform inversion - Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); - invert_scalapack(chi0_block, desc_nabf_nabf_opt); - // subtract 1 from diagonal - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) -= 1.0; - } - Profiler::stop("epsilon_invert_eps"); - } - // debug for GaAs - // for (int i = 0; i != n_abf; i++) - // { - // for (int j = 0; j != n_abf; j++) - // { - // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - // if (ilo < 0) continue; - // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); - // if (jlo < 0) continue; - // if(i==j) - // chi0_block(ilo, jlo) = 1.0; - // else - // chi0_block(ilo, jlo) = 0.0; - // } - // } - // debug for unfold shrink Wc - // for (int i = 0; i != n_abf; i++) - //{ - // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - // if (ilo < 0) continue; - // for (int j = 0; j != n_abf; j++) - // { - // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); - // if (jlo < 0) continue; - // if (i == j) - // chi0_block(ilo, jlo) = 1.0; - // else - // chi0_block(ilo, jlo) = 0.0; - // } - // } - // debug end - - 
Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coulwc_block.ptr(), 1, - 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_chi0_block.ptr(), - 1, 1, desc_nabf_nabf_opt.desc, coulwc_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, 0.0, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, - desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); - Profiler::stop("epsilon_multiply_coulwc"); - // lib_printf("chi0_block\n%s", str(chi0_block).c_str()); - // now chi0_block is the screened Coulomb interaction Wc (i.e. W-V) - - Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); - Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); - map>>> Wc_MNmap; - // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, - // LIBRPA::atomic_basis_abf, chi0_block, - // desc_nabf_nabf, MAJOR::ROW); - map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, - temp_block, desc_nabf_nabf, MAJOR::ROW); - Profiler::stop("epsilon_convert_wc_map_block"); - - Profiler::start("epsilon_convert_wc_communicate", "Communicate"); - { - std::map>, - RI::Tensor>>> - Wc_libri; - Profiler::start("epsilon_convert_wc_communicate_1"); - for (const auto &M_NWc : Wc_MNmap) - { - const auto &M = M_NWc.first; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_Wc : M_NWc.second) - { - const auto &N = N_Wc.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &Wc = N_Wc.second; - // std::valarray> Wc_va(Wc.ptr(), Wc.size()); - // auto pWc = std::make_shared>>(); - // *pWc = Wc_va; - /*if (iq == 10 && ifreq == 10) - { - 
char fn[100]; - sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); - print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); - }*/ - Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); - } - } - Profiler::stop("epsilon_convert_wc_communicate_1"); - Profiler::start("epsilon_convert_wc_communicate_2"); - // main timing - // cout << Wc_libri; - const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); - Profiler::stop("epsilon_convert_wc_communicate_2"); - Profiler::start("epsilon_convert_wc_communicate_3"); - // parse collected to - for (const auto &MN : atpair_local) - { - const auto &M = MN.first; - const auto &N = MN.second; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - // Use row major for later usage in LibRI - Wc_freq_q[freq][M][N][q] = matrix_m>( - n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); - } - Profiler::stop("epsilon_convert_wc_communicate_3"); - // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) - // for ( int i_nu = 0; i_nu != n_nu; i_nu++ ) - // { - // } - } - Profiler::stop("epsilon_convert_wc_communicate"); - Profiler::stop("epsilon_convert_wc_2d_to_ij"); - Profiler::cease("epsilon_wc_work_q_omega"); - LIBRPA::utils::lib_printf_root( - "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, - Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), - Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); - } - } -#else - throw std::logic_error("need compilation with LibRI"); -#endif - Profiler::cease("compute_Wc_freq_q_work"); - LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("compute_Wc_freq_q_work"), - Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); - - return Wc_freq_q; -} - -map, matrix_m>>>::pair_t_old> -FT_Wc_freq_q( - const map, matrix_m>>>::pair_t_old> - 
&Wc_freq_q, - const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) -{ - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - map, matrix_m>>>::pair_t_old> - Wc_freq_R; - const int ngrids = tfg.get_n_grids(); - if (Params::debug) - { - if (mpi_comm_global_h.is_root()) lib_printf("Converting Wc q,w -> R,t\n"); - mpi_comm_global_h.barrier(); - } - set> atpairs_unique; - for (const auto &freq_MuNuqWc : Wc_freq_q) - { - for (const auto &Mu_NuqWc : freq_MuNuqWc.second) - { - const auto Mu = Mu_NuqWc.first; - for (const auto &Nu_qWc : Mu_NuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - } - - vector>, pair>> ifreqR_atpair_all; - // allocate space before hand - for (auto R : Rlist) - { - for (int ifreq = 0; ifreq != ngrids; ifreq++) - { - auto freq = tfg.get_freq_nodes()[ifreq]; - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu[Nu]; - Wc_freq_R[freq][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); - ifreqR_atpair_all.push_back({{ifreq, R}, atpair_unique}); - } - } - } - -#pragma omp parallel for schedule(dynamic) - for (auto ifreqR_atpair : ifreqR_atpair_all) - { - const auto ifreq = ifreqR_atpair.first.first; - const auto freq = tfg.get_freq_nodes()[ifreq]; - const auto R = ifreqR_atpair.first.second; - const auto Mu = ifreqR_atpair.second.first; - const auto Nu = ifreqR_atpair.second.second; - const int n_mu = atom_mu[Mu]; - const int n_nu = atom_mu[Nu]; - - // thread local temporary matrix - matrix_m> WfR_temp(n_mu, n_nu, major_Wc); - - if (Wc_freq_q.count(freq) == 0) continue; - if (Wc_freq_q.at(freq).count(Mu) == 0) continue; - if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; - - for (auto &Wc_q : Wc_freq_q.at(freq).at(Mu).at(Nu)) - { - const auto 
q = Wc_q.first; - const auto &Wc = Wc_q.second; - for (auto q_bz : map_irk_ks[q]) - { - const double ang = -q_bz * (R * latvec) * TWO_PI; - const complex weight = - complex(cos(ang), sin(ang)) / double(n_k_points); - if (q == q_bz) - WfR_temp += Wc * weight; - else - WfR_temp += conj(Wc) * weight; - } - } - // omp_set_lock(&lock_Wc); - Wc_freq_R[freq][Mu][Nu][R] += WfR_temp; - // omp_unset_lock(&lock_Wc); - } - - if (mpi_comm_global_h.is_root()) - { - lib_printf("Done converting Wc(q,w) -> Wc(R,w)\n"); - } - mpi_comm_global_h.barrier(); - - return Wc_freq_R; -} - -map, matrix_m>>>::pair_t_old> -CT_FT_Wc_freq_q( - const map, matrix_m>>>::pair_t_old> - &Wc_freq_q, - const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) -{ - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - map, matrix_m>>>::pair_t_old> - Wc_tau_R; - if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); - const int ngrids = tfg.get_n_grids(); - - LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(R,t)\n"); - mpi_comm_global_h.barrier(); - - set> atpairs_unique; - for (const auto &freq_MuNuqWc : Wc_freq_q) - { - for (const auto &Mu_NuqWc : freq_MuNuqWc.second) - { - const auto Mu = Mu_NuqWc.first; - for (const auto &Nu_qWc : Mu_NuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - } - - vector>, pair>> itauR_atpair_all; - // allocate space before hand - for (auto R : Rlist) - { - for (int itau = 0; itau != ngrids; itau++) - { - auto tau = tfg.get_time_nodes()[itau]; - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu[Nu]; - Wc_tau_R[tau][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); - itauR_atpair_all.push_back({{itau, R}, atpair_unique}); - } - } - } 
- - LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", - LIBRPA::envs::myid_global, itauR_atpair_all.size(), - omp_get_max_threads()); - -#pragma omp parallel for schedule(dynamic) - for (auto itauR_atpair : itauR_atpair_all) - { - const auto itau = itauR_atpair.first.first; - const auto tau = tfg.get_time_nodes()[itau]; - const auto R = itauR_atpair.first.second; - const auto Mu = itauR_atpair.second.first; - const auto Nu = itauR_atpair.second.second; - const int n_mu = atom_mu[Mu]; - const int n_nu = atom_mu[Nu]; - - // thread local temporary matrix - matrix_m> WtR_temp(n_mu, n_nu, major_Wc); - - for (int ifreq = 0; ifreq < ngrids; ifreq++) - { - const auto freq = tfg.get_freq_nodes()[ifreq]; - const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); - // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << - // "\n"; - if (Wc_freq_q.count(freq) == 0) continue; - if (Wc_freq_q.at(freq).count(Mu) == 0) continue; - if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; - // cout << "freq: " << freq << "\n"; - - const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); - for (auto &Wc_q : Wc_q_all) - { - const auto q = Wc_q.first; - const auto &Wc = Wc_q.second; - for (auto q_bz : map_irk_ks[q]) - { - const double ang = -q_bz * (R * latvec) * TWO_PI; - const complex weight = - complex(cos(ang), sin(ang)) * f2t / double(n_k_points); - // ofs_myid << q << " " << q_bz << " weight = " << weight << "\n"; - // ofs_myid << q_Wc.second; - if (q == q_bz) - WtR_temp += Wc * weight; - else - WtR_temp += conj(Wc) * weight; - } - } - } - // omp_set_lock(&lock_Wc); - Wc_tau_R[tau][Mu][Nu][R] += WtR_temp; - // omp_unset_lock(&lock_Wc); - } - - LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> R,t\n"); - mpi_comm_global_h.barrier(); - - // myz debug: check the imaginary part of the matrix - // NOTE: if G(R) is real, is W(R) real as well? 
- // if (Params::debug) - // { - // for (const auto & tau_MuNuRWc: Wc_tau_R) - // { - // char fn[80]; - // auto tau = tau_MuNuRWc.first; - // auto itau = tfg.get_time_index(tau); - // for (const auto & Mu_NuRWc: tau_MuNuRWc.second) - // { - // auto Mu = Mu_NuRWc.first; - // // const int n_mu = atom_mu[Mu]; - // for (const auto & Nu_RWc: Mu_NuRWc.second) - // { - // auto Nu = Nu_RWc.first; - // // const int n_nu = atom_mu[Nu]; - // for (const auto & R_Wc: Nu_RWc.second) - // { - // auto R = R_Wc.first; - // auto Wc = R_Wc.second; - // auto iteR = std::find(Rlist.cbegin(), Rlist.cend(), R); - // auto iR = std::distance(Rlist.cbegin(), iteR); - // sprintf(fn, "Wc_Mu_%zu_Nu_%zu_iR_%zu_itau_%d_id_%d.mtx", Mu, Nu, iR, - // itau, mpi_comm_global_h.myid); print_matrix_mm_file(Wc, - // Params::output_dir + "/" + fn, 1e-10); - // } - // } - // } - // } - // } - // end myz debug - return Wc_tau_R; -} - -map, matrix_m>>>::pair_t_old> -CT_FT_Wc_freq2time_q( - const map, matrix_m>>>::pair_t_old> - &Wc_freq_q, - const TFGrids &tfg, const int &n_k_points, const vector> &Rlist, - const vector> &qlist) -{ - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - map, matrix_m>>>::pair_t_old> - Wc_tau_q; - if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); - const int ngrids = tfg.get_n_grids(); - - LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(q,t)\n"); - mpi_comm_global_h.barrier(); - - set> atpairs_unique; - for (const auto &freq_MuNuqWc : Wc_freq_q) - { - for (const auto &Mu_NuqWc : freq_MuNuqWc.second) - { - const auto Mu = Mu_NuqWc.first; - for (const auto &Nu_qWc : Mu_NuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - } - vector>> itau_atpair_all; - // allocate space before hand - - for (int itau = 0; itau != ngrids; itau++) - { - auto tau = 
tfg.get_time_nodes()[itau]; - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu_s[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu_s[Nu]; - for (auto q : qlist) - Wc_tau_q[tau][Mu][Nu][q] = matrix_m>(n_mu, n_nu, major_Wc); - itau_atpair_all.push_back({itau, atpair_unique}); - } - } - - LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", - LIBRPA::envs::myid_global, itau_atpair_all.size(), - omp_get_max_threads()); - -#pragma omp parallel for schedule(dynamic) - for (auto itau_atpair : itau_atpair_all) - { - const auto itau = itau_atpair.first; - const auto tau = tfg.get_time_nodes()[itau]; - const auto Mu = itau_atpair.second.first; - const auto Nu = itau_atpair.second.second; - const int n_mu = atom_mu_s[Mu]; - const int n_nu = atom_mu_s[Nu]; - - // thread local temporary matrix - matrix_m> Wtq_temp(n_mu, n_nu, major_Wc); - - for (int ifreq = 0; ifreq < ngrids; ifreq++) - { - const auto freq = tfg.get_freq_nodes()[ifreq]; - const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); - // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << - // "\n"; - if (Wc_freq_q.count(freq) == 0) continue; - if (Wc_freq_q.at(freq).count(Mu) == 0) continue; - if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; - // cout << "freq: " << freq << "\n"; - - const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); - for (auto &Wc_q : Wc_q_all) - { - const auto q = Wc_q.first; - const auto &Wc = Wc_q.second; - const double weight = f2t; - Wtq_temp = Wc * weight; - // omp_set_lock(&lock_Wc); - Wc_tau_q[tau][Mu][Nu][q] += Wtq_temp; - // omp_unset_lock(&lock_Wc); - } - } - } - - LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> q,t\n"); - mpi_comm_global_h.barrier(); - - return Wc_tau_q; -} - -atom_mapping, matrix_m>>>::pair_t_old CT_FT_Wc_tau_R2q( - const atom_mapping, matrix_m>>>::pair_t_old - &Wc_tau_q, - const TFGrids &tfg, 
const int &n_kpoints, const vector> &Rlist, - const int &itau) -{ - const int tau = tfg.get_time_nodes()[itau]; - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - atom_mapping, matrix_m>>>::pair_t_old Wc_tau_R; - if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); - const int ngrids = tfg.get_n_grids(); - - LIBRPA::utils::lib_printf_root("Converting Wc(q,t) -> W(R,t)\n"); - mpi_comm_global_h.barrier(); - - set> atpairs_unique; - for (const auto &MuNuqWc : Wc_tau_q) - { - const auto Mu = MuNuqWc.first; - for (const auto &Nu_qWc : MuNuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - - vector, pair>> iR_atpair_all; - // allocate space before hand - for (auto R : Rlist) - { - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu_l[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu_l[Nu]; - Wc_tau_R[Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); - iR_atpair_all.push_back({R, atpair_unique}); - } - } - - LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", - LIBRPA::envs::myid_global, iR_atpair_all.size(), - omp_get_max_threads()); - -#pragma omp parallel for schedule(dynamic) - for (auto iR_atpair : iR_atpair_all) - { - const auto R = iR_atpair.first; - const auto Mu = iR_atpair.second.first; - const auto Nu = iR_atpair.second.second; - const int n_mu = atom_mu_l[Mu]; - const int n_nu = atom_mu_l[Nu]; - - // thread local temporary matrix - matrix_m> WtR_temp(n_mu, n_nu, major_Wc); - - if (Wc_tau_q.count(Mu) == 0) continue; - if (Wc_tau_q.at(Mu).count(Nu) == 0) continue; - // cout << "freq: " << freq << "\n"; - - const auto &Wc_q_all = Wc_tau_q.at(Mu).at(Nu); - for (auto &Wc_q : Wc_q_all) - { - const auto q = Wc_q.first; - const auto &Wc = 
Wc_q.second; - for (auto q_bz : map_irk_ks[q]) - { - const double ang = -q_bz * (R * latvec) * TWO_PI; - const complex weight = - complex(cos(ang), sin(ang)) / double(n_kpoints); - if (q == q_bz) - WtR_temp += Wc * weight; - else - WtR_temp += conj(Wc) * weight; - } - } - // omp_set_lock(&lock_Wc); - Wc_tau_R[Mu][Nu][R] += WtR_temp; - // omp_unset_lock(&lock_Wc); - } - - LIBRPA::utils::lib_printf_root("Done converting Wc q,t -> R,t\n"); - mpi_comm_global_h.barrier(); - return Wc_tau_R; -} - -void test_libcomm_for_system(const atpair_k_cplx_mat_t &coulmat) -{ - if (mpi_comm_global_h.myid == 0) lib_printf("test_libcomm_for_system Coulumb\n"); - // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - mpi_comm_global_h.barrier(); - - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // use a square blocksize instead max block, otherwise heev and inversion will complain about - // illegal parameter - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - - auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - - vector> qpts; - for (const auto &qMuNuchi : irk_weight) qpts.push_back(qMuNuchi.first); - -#ifdef LIBRPA_USE_LIBRI - for (const auto &q : qpts) - { - coul_block.zero_out(); - - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - std::array qa = {q.x, q.y, q.z}; - // collect the block elements of coulomb matrices - { - double vq_begin = omp_get_wtime(); - // LibRI tensor for communication, release once done - std::map>, Tensor>>> - coul_libri; - coul_libri.clear(); - int count_coul = 0; - for (const auto &Mu_Nu : local_atpair) - { - const auto 
Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << - // endl; - if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || - coulmat.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); - count_coul += 1; - } - int count_pair = 0; - for (auto &Mu : coul_libri) - { - for (auto &nu_q : Mu.second) - { - count_pair += 1; - } - } - // printf("Finish RPA blacs 2d vq arr\n"); - double arr_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - double comm_begin = omp_get_wtime(); - lib_printf( - "Begin comm_map2_first myid: %d q:(%f, %f, %f) count_coul: %d count_pair: %d\n", - mpi_comm_global_h.myid, q.x, q.y, q.z, count_coul, count_pair); - const auto IJq_coul = - comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); - double comm_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - // printf("End vq comm_map2_first myid: %d TIME_USED: - // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); - // ofs_myid << "IJq_coul" << endl << IJq_coul; - // printf("Finish RPA blacs 2d vq 2d\n"); - double block_begin = omp_get_wtime(); - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // // cout << IJq_coul.at(I).at({J, qa}); - // collect_block_from_IJ_storage_syhe( - // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", - // // blacs_ctxt_global_h.myid, I, J, - // // coul_block.nr(), coul_block.nc(), - // // str(coul_block).c_str()); - // } - 
collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - double block_end = omp_get_wtime(); - // lib_printf("Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f - // pair_size: %d\n",mpi_comm_global_h.myid,arr_end-vq_begin, comm_end-comm_begin, - // block_end-block_begin,set_IJ_nabf_nabf.size()); - mpi_comm_global_h.barrier(); - double vq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", - vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, - vq_end - block_begin); - } - } - lib_printf("Success test_libcomm_for_system\n"); -#endif -} From ee6853bdcff7abfd113ab07268553904d242886c Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:19:41 +0800 Subject: [PATCH 04/18] Delete specific file --- epsilon.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/epsilon.cpp b/epsilon.cpp index 8b137891..139597f9 100644 --- a/epsilon.cpp +++ b/epsilon.cpp @@ -1 +1,2 @@ + From ece4c3f8702f4c6f3df8d8614f1de2038c908089 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:23:08 +0800 Subject: [PATCH 05/18] Delete epsilon.cpp --- epsilon.cpp | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 epsilon.cpp diff --git a/epsilon.cpp b/epsilon.cpp deleted file mode 100644 index 139597f9..00000000 --- a/epsilon.cpp +++ /dev/null @@ -1,2 +0,0 @@ - - From ab5910f48810c9731b05167116af68bb659519e3 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:33:52 +0800 Subject: [PATCH 06/18] change pgemm by pgeadd in cal_eps replace commented-out identity matrix initialization and matrix multiplication code by plead --- src/dielecmodel.cpp | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff 
--git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index b58a2059..87507dcd 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -1296,24 +1296,30 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - auto identity = init_local_mat>(desc_body, MAJOR::COL); - for (int i = 0; i < n_nonsingular - 1; i++) - { - const int ilo = desc_body.indx_g2l_r(i); - if (ilo < 0) continue; - for (int j = 0; j < n_nonsingular - 1; j++) - { - const int jlo = desc_body.indx_g2l_c(j); - if (jlo < 0) continue; - if (i == j) - identity(ilo, jlo) = 1.0; - else - identity(ilo, jlo) = 0.0; - } - } - ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - 1, - 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, - desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + // auto identity = init_local_mat>(desc_body, MAJOR::COL); + // for (int i = 0; i < n_nonsingular - 1; i++) + // { + // const int ilo = desc_body.indx_g2l_r(i); + // if (ilo < 0) continue; + // for (int j = 0; j < n_nonsingular - 1; j++) + // { + // const int jlo = desc_body.indx_g2l_c(j); + // if (jlo < 0) continue; + // if (i == j) + // identity(ilo, jlo) = 1.0; + // else + // identity(ilo, jlo) = 0.0; + // } + // } + // ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - 1, + // 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, + // desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgeadd_f( + 'N', n_nonsingular - 1, n_nonsingular - 1, + 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, + 1.0, + chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); Profiler::stop("cal_inverse_dielectric_matrix_ij"); if (mpi_comm_global_h.is_root()) std::cout << "* Success: calculate average inverse dielectric matrix no." 
<< ifreq + 1 From c7ead5c9b95fcad3bde9ee8465f27f5833c03d48 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:50:29 +0800 Subject: [PATCH 07/18] add gpu install setting --- CMakeLists.txt | 52 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9d4ee05..4264025f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,14 +20,18 @@ set(LIB_NAME "rpa") set(FORTRAN_LIB_NAME "rpa_f") # options setup -option(USE_LIBRI "Use LibRI for tensor contraction" OFF) +option(USE_LIBRI "Use LibRI for tensor contraction" ON) option(USE_CMAKE_INC "Use cmake.inc for configure" OFF) -option(USE_GREENX_API "Use GreenX API for minimax grids generation" OFF) +option(USE_GREENX_API "Use GreenX API for minimax grids generation" ON) option(USE_EXTERNAL_GREENX "Use external GreenX library rather than the packaged one" OFF) option(ENABLE_TEST "Flag to build unit tests" ON) option(ENABLE_DRIVER "Flag to build driver executables" ON) option(ENABLE_FORTRAN_BIND "Flag to build Fotran binding" OFF) option(VERBOSE_OUTPUT "Flag to print verbose information in stdout and process output" OFF) +option(USE_CUDA "Use cuda for EcRPA calculation" OFF) +option(ENABLE_CUSOLVERMP "Use cusolverMp for EcRPA calculation" OFF) +option(ENABLE_CUBLASMP "Use cublasmp for EcRPA calculation" OFF ) +option(ENABLE_NVHPC "Use nvhpc for calculation" OFF ) # NOTE: static library not tested option(BUILD_LIBRPA_SHARED "Flag to build shared libraries" ON) @@ -61,6 +65,10 @@ if(USE_GREENX_API OR ENABLE_FORTRAN_BIND) else() enable_language(CXX) endif() +# added by hbchen in 2025-5-19 +if(USE_CUDA) + enable_language(CUDA) +endif() # bypass the deprecation warning of classic C++ of oneAPI if(CMAKE_CXX_COMPILER_ID MATCHES Intel) @@ -177,6 +185,46 @@ if(USE_LIBRI) add_compile_definitions("LIBRPA_USE_LIBRI") endif() + +# add gpu accelaration by chenhaobo in 2025-04-26 +# 
cmake_policy(SET CMP0146 NEW) +if(USE_CUDA) + find_package(CUDA REQUIRED) + find_package(CUDAToolkit REQUIRED) + add_definitions(-DADD_) + include_directories(${MAGMA_ROOT}/include) + set(MAGMA_INCLUDE_DIR "${MAGMA_ROOT}/include") + set(MAGMA_INCLUDE_DIR $ENV{MAGMA_INCLUDE_DIR}) + link_directories(${MAGMA_ROOT}/lib) + set(CMAKE_CUDA_STANDARD 14) + set(CUDA_LIBRARIES "-lcublas -lcudart -lcusolver") + list(APPEND math_libs + ${CUDA_LIBRARIES} + ${MAGMA_ROOT}/lib/libmagma.so + ) + add_compile_definitions("LIBRPA_USE_CUDA") +endif() +if(ENABLE_NVHPC) +add_compile_definitions("ENABLE_NVHPC") +set(ENABLE_CUSOLVERMP ON) +set(ENABLE_CUBLASMP ON) +endif() +if(ENABLE_CUSOLVERMP) + add_compile_definitions("ENABLE_CUSOLVERMP") + set(CUSOLVERMP_LIBRAYIES "-lcal -lcusolverMp") + list(APPEND math_libs + ${CUSOLVERMP_LIBRAYIES} + ) +endif() +if(ENABLE_CUBLASMP) + add_compile_definitions("ENABLE_CUBLASMP") + set(CUBLASMP_LIBRAYIES "-lcublasmp -lcurand -lcublas") + list(APPEND math_libs + ${CUBLASMP_LIBRAYIES} + ) +endif() +# finish adding gpu accelaration by chenhaobo in 2025-04-26 + if(CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Build type set to Debug, adding LIBRPA_DEBUG preprocessor directive") add_compile_definitions("LIBRPA_DEBUG") From b47a007254448e15c4c4a6a6aa8680f53a398034 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:55:23 +0800 Subject: [PATCH 08/18] add gpu calculation for rpa and g0w0_band --- src/CMakeLists.txt | 23 + src/app_rpa.cpp | 21 + src/array_desc_device.cu | 36 + src/array_desc_device.h | 61 ++ src/cuda_connector.cpp | 2108 ++++++++++++++++++++++++++++++++++++++ src/cuda_connector.cu | 601 +++++++++++ src/cuda_connector.h | 621 +++++++++++ src/device_connector.cpp | 248 +++++ src/device_connector.h | 54 + src/device_stream.cpp | 34 + src/device_stream.h | 134 +++ src/epsilon_cuda.cpp | 1448 ++++++++++++++++++++++++++ src/epsilon_cuda.h | 18 + src/matrix_device.cpp | 16 + 
src/matrix_device.h | 111 ++ 15 files changed, 5534 insertions(+) create mode 100644 src/array_desc_device.cu create mode 100644 src/array_desc_device.h create mode 100644 src/cuda_connector.cpp create mode 100644 src/cuda_connector.cu create mode 100644 src/cuda_connector.h create mode 100644 src/device_connector.cpp create mode 100644 src/device_connector.h create mode 100644 src/device_stream.cpp create mode 100644 src/device_stream.h create mode 100644 src/epsilon_cuda.cpp create mode 100644 src/epsilon_cuda.h create mode 100644 src/matrix_device.cpp create mode 100644 src/matrix_device.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e36be38e..213ef544 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,6 +72,29 @@ else() list(APPEND lib_sources get_minimax_local.cpp) endif() +if(USE_CUDA) + list(APPEND lib_sources + epsilon_cuda.cpp + cuda_connector.cu + ) +endif() +if(ENABLE_NVHPC) + list(APPEND lib_sources + cuda_connector.cpp + array_desc_device.cu + device_stream.cpp + matrix_device.cpp + device_connector.cpp + ) + # set_source_files_properties( + # base_blacs.cpp + # PROPERTIES + # LANGUAGE CUDA + # # COMPILE_OPTIONS "--x cu" + # ) +endif() + + target_include_directories(rpa_lib PRIVATE # ${CMAKE_CURRENT_LIST_DIR} diff --git a/src/app_rpa.cpp b/src/app_rpa.cpp index a6cb3128..a837684c 100644 --- a/src/app_rpa.cpp +++ b/src/app_rpa.cpp @@ -11,6 +11,10 @@ #include "stl_io_helper.h" #include "utils_mem.h" #include "utils_timefreq.h" +#ifdef LIBRPA_USE_CUDA +#include // added by hbchen in 2025-5-17 +#include // added by hbchen in 2025-7-26 +#endif namespace LIBRPA { @@ -87,6 +91,23 @@ void get_rpa_correlation_energy_(std::complex &rpa_corr, mpi_comm_global_h.barrier(); Profiler::start("EcRPA", "Compute RPA correlation Energy"); CorrEnergy corr; + #ifdef LIBRPA_USE_CUDA + int deviceCount; + cudaError_t err= cudaGetDeviceCount(&deviceCount); + // lib_printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0); + printf("Number of CUDA 
devices: %d\n", deviceCount); + // deviceCount!=4 is a bug, because if I don't set the gres=gpu:*, cuda will detect all the gpu device + if(err==cudaSuccess&&deviceCount>0){//说明存在gpu设备 + #ifdef ENABLE_NVHPC + if (Params::use_scalapack_ecrpa && + (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI)) + corr = compute_RPA_correlation_blacs_2d_cuda(chi0, Vq); + else + #endif + corr = compute_RPA_correlation_cuda(chi0, Vq); + }else + #endif if (Params::use_scalapack_ecrpa && (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI)) diff --git a/src/array_desc_device.cu b/src/array_desc_device.cu new file mode 100644 index 00000000..5dab0fa5 --- /dev/null +++ b/src/array_desc_device.cu @@ -0,0 +1,36 @@ +#include "array_desc_device.h" + +Array_Desc_Device::Array_Desc_Device(const LIBRPA::Array_Desc& array_desc) { + this->ictxt_ = array_desc.ictxt(); + this->m_ = array_desc.m(); + this->n_ = array_desc.n(); + this->mb_ = array_desc.mb(); + this->nb_ = array_desc.nb(); + this->lld_ = array_desc.lld(); + this->irsrc_ = array_desc.irsrc(); + this->icsrc_ = array_desc.icsrc(); + this->m_local_ = array_desc.m_loc(); + this->n_local_ = array_desc.n_loc(); + this->myprow_ = array_desc.myprow(); + this->mypcol_ = array_desc.mypcol(); + this->nprows_ = array_desc.nprows(); + this->npcols_ = array_desc.npcols(); +} +__host__ __device__ int Array_Desc_Device::indx_g2p(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return (isrcproc + indxglob / nb) % nprocs; +} +__host__ __device__ int Array_Desc_Device::indx_g2l(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__host__ __device__ int Array_Desc_Device::indx_g2l_r(int gindx)const{ + return this->myprow_ != indx_g2p(gindx, this->mb_, this->irsrc_, this->nprows_) || + 
gindx >= this->m_ + ? -1 + : indx_g2l(gindx, this->mb_, this->irsrc_, this->nprows_); +} +__host__ __device__ int Array_Desc_Device::indx_g2l_c(int gindx)const{ + return this->mypcol_ != indx_g2p(gindx, this->nb_, this->icsrc_, this->npcols_) || + gindx >= this->n_ + ? -1 + : indx_g2l(gindx, this->nb_, this->icsrc_, this->npcols_); +} \ No newline at end of file diff --git a/src/array_desc_device.h b/src/array_desc_device.h new file mode 100644 index 00000000..15e25e80 --- /dev/null +++ b/src/array_desc_device.h @@ -0,0 +1,61 @@ +#include "base_blacs.h" +class Array_Desc_Device{ +private: + int ictxt_; + // int nprocs_; + // int myid_; + int nprows_; + int myprow_; + int npcols_; + int mypcol_; + + // Array dimensions + int m_; + int n_; + int mb_; + int nb_; + int irsrc_; + int icsrc_; + int lld_; + int m_local_; + int n_local_; + __host__ __device__ static int indx_g2p( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); + __host__ __device__ static int indx_g2l( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); +public: + Array_Desc_Device(const LIBRPA::Array_Desc& array_desc); + __host__ __device__ + int indx_g2l_r(int gindx) const; + __host__ __device__ + int indx_g2l_c(int gindx) const; + __host__ __device__ + const int& m() const{ return m_; } + __host__ __device__ + const int& n() const{ return n_; } + __host__ __device__ + const int& mb() const{ return mb_; } + __host__ __device__ + const int& nb() const{ return nb_; } + __host__ __device__ + const int& irsrc() const{ return irsrc_; } + __host__ __device__ + const int& icsrc() const{ return icsrc_; } + __host__ __device__ + const int& lld() const{ return lld_; } + __host__ __device__ + const int& m_loc() const{ return m_local_; } + __host__ __device__ + const int& n_loc() const{ return n_local_; } + __host__ __device__ + const int& nprows() const{ return nprows_; } + __host__ __device__ + const int& npcols() const{ return npcols_; } + __host__ 
__device__ + const int& myprow() const{ return myprow_; } + __host__ __device__ + const int& mypcol() const{ return mypcol_; } + __host__ __device__ + const int& ictxt() const{ return ictxt_; } + +}; \ No newline at end of file diff --git a/src/cuda_connector.cpp b/src/cuda_connector.cpp new file mode 100644 index 00000000..f273c680 --- /dev/null +++ b/src/cuda_connector.cpp @@ -0,0 +1,2108 @@ +#include "cuda_connector.h" +#include +// cusolverMp include +#include "helpers.h" +#include +#include "lapack_connector.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +#include "device_stream.h" +// #define CUSOLVERMP_MPI_GRID_COL_MAJOR +// #define OPEN_TEST_FOR_LU_DECOMPOSITION +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// #include +// #include +// #endif +// #ifdef ENABLE_CUBLASMP +// // #include +// // #include +// #endif +// #include +// #ifdef ENABLE_CUSOLVERMP +// void CudaConnector::pzgetrf_cusolverMp(const int &m, const int &n, std::complex *h_C_A, const int &ia, +// const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int *ipiv, int &h_info_getrf,const char order) +// { +// const int64_t M = arrdesc_pi.m(); +// const int64_t N = arrdesc_pi.n(); + +// const int64_t IA = ia; +// const int64_t JA = ja; + +// /* Tile sizes */ +// const int64_t MA = arrdesc_pi.mb(); +// const int64_t NA = arrdesc_pi.nb(); +// int numRowDevices, numColDevices; +// if(order=='C'){ +// numRowDevices = arrdesc_pi.npcols(); +// numColDevices = arrdesc_pi.nprows(); +// } +// else if(order=='R'){ +// numRowDevices = arrdesc_pi.nprows(); +// numColDevices = arrdesc_pi.npcols(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + + +// const uint32_t RSRCA = 0; +// const uint32_t CSRCA = 0; + +// int mpiCommSize, mpiRank; +// MPI_Comm_size(MPI_COMM_WORLD, &mpiCommSize); +// MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank); + +// int local_device = getLocalDevice(); +// int numDevices = 0; + +// // printf("Number of devices = %d\n", numDevices); 
+// // printf("local_device = %d, mpiRank = %d, mpiCommSize = %d\n", local_device, mpiRank, mpiCommSize); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_set_device=omp_get_wtime(); +// #endif +// cudaError_t cudaStat = cudaSetDevice(local_device); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for set device:%f seconds, rank:%d\n", omp_get_wtime() - time_start_set_device,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); +// cudaStat = cudaFree(0); +// assert(cudaStat == cudaSuccess); +// // const int mpiRank_T=arrdesc_pi.myprow()+ arrdesc_pi.mypcol()*arrdesc_pi.npcols(); +// const int rank = mpiRank; +// // printf("mpiRank=%d,mpiRank_T=%d\n", mpiRank, mpiRank_T); +// const int commSize = mpiCommSize; +// /* Library handles */ +// cusolverMpHandle_t cusolverMpHandle = NULL; +// cal_comm_t cal_comm = NULL; + +// /* Error codes */ +// cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; +// calError_t calStat = CAL_OK; +// cudaStat = cudaSuccess; + +// /* User defined stream */ +// cudaStream_t localStream = NULL; +// cal_comm_create_params_t params; +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = commSize; +// params.local_device = local_device; + +// calStat = cal_comm_create(params, &cal_comm); +// assert(calStat == CAL_OK); + +// /* Create local stream */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_create_stream=omp_get_wtime(); +// #endif +// cudaStat = cudaStreamCreate(&localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for create stream:%f seconds, rank:%d\n", omp_get_wtime() - time_start_create_stream,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); + +// /* Initialize cusolverMp library handle */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_cusolverMpCreate=omp_get_wtime(); +// #endif +// cusolverStat = 
cusolverMpCreate(&cusolverMpHandle, local_device, localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for cusolverMpCreate:%f seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank); +// #endif +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* cusolverMp grids */ +// cusolverMpGrid_t gridA_C = NULL; + +// /* cusolverMp matrix descriptors */ +// cusolverMpMatrixDescriptor_t descrA_C = NULL; + +// /* Distributed matrices */ +// void* d_C_A = NULL; // for cuDoubleComplex +// int64_t* d_ipiv = NULL; + +// /* Distributed device workspace */ +// void* d_work_getrf_C = NULL; + +// /* Distributed host workspace */ +// void* h_work_getrf_C = NULL; + +// /* size of workspace on device */ +// size_t workspaceInBytesOnDevice_getrf_C = 0; + +// /* size of workspace on host */ +// size_t workspaceInBytesOnHost_getrf_C = 0; + +// /* error codes from cusolverMp (device) */ +// int* d_info_getrf = NULL; + +// /* error codes from cusolverMp (host) */ +// // int h_info_getrf = 0; + +// /* Single process per device */ +// assert((numRowDevices * numColDevices) == commSize); + +// /* =========================================== */ +// /* Create inputs on master rank */ +// /* =========================================== */ + +// const int64_t lda = (IA - 1) + N; +// const int64_t colsA = (JA - 1) + N; +// // cuDoubleComplex* h_C_A = NULL; +// int64_t LLDA, localColsA; +// if(order=='C'){ +// localColsA =arrdesc_pi.m_loc(); +// LLDA=arrdesc_pi.n_loc(); +// }else if(order=='R'){ +// localColsA =arrdesc_pi.n_loc(); +// LLDA=arrdesc_pi.m_loc(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + +// /* Allocate global d_A */ +// cudaStat = cudaMalloc((void**)&d_C_A, localColsA * LLDA * sizeof(cuDoubleComplex)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* CREATE GRID DESCRIPTORS */ +// /* =========================================== */ +// 
if(order=='C'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); +// }else if(order=='R'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* =========================================== */ +// /* CREATE MATRIX DESCRIPTORS */ +// /* =========================================== */ +// cusolverStat = cusolverMpCreateMatrixDesc( +// &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* Allocate global d_ipiv */ +// /* REMARK : ipiv overlaps A[IA, JA:JA+N] as in Netlib's ScaLAPACK */ +// cudaStat = cudaMalloc((void**)&d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* ALLOCATE D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMalloc((void**)&d_info_getrf, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* RESET D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMemset(d_info_getrf, 1, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ +// /* =========================================== */ +// cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// CUDA_C_64F, +// &workspaceInBytesOnDevice_getrf_C, +// &workspaceInBytesOnHost_getrf_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* 
=========================================== */ +// /* ALLOCATE PGETRF WORKSPACE */ +// /* =========================================== */ +// cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C); +// assert(cudaStat == cudaSuccess); +// h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); +// assert(h_work_getrf_C != NULL); + +// // copy matrix from h_C_A to d_C_A +// std::complex* h_C_A_temp = NULL; +// size_t temp_size = (int64_t)localColsA * (int64_t)LLDA * sizeof(cuDoubleComplex); +// if(order=='C'){ +// h_C_A_temp = LapackConnector::transpose(h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// cudaStat = cudaMemcpy(d_C_A, h_C_A_temp, temp_size, cudaMemcpyHostToDevice); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(d_C_A, h_C_A, temp_size, cudaMemcpyHostToDevice); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to device */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* =========================================== */ +// /* CALL PGETRF */ +// /* =========================================== */ +// h_info_getrf=1; +// // printf("h_info_getrf before LU composition(cuDoubleComplex) : %d\n", h_info_getrf); +// // printf("LU decomposition begin(cuDoubleComplex)\n"); +// double start_time_C = omp_get_wtime(); +// cusolverStat = cusolverMpGetrf(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// CUDA_C_64F, +// d_work_getrf_C, +// workspaceInBytesOnDevice_getrf_C, +// h_work_getrf_C, +// workspaceInBytesOnHost_getrf_C, +// d_info_getrf); + +// /* sync after cusolverMpGetrf */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); +// // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank); + +// /* copy d_info_getrf to host */ +// 
cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); +// assert(cudaStat == cudaSuccess); +// /* wait for d_info_getrf copy */ +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); +// // printf("h_info_getrf after composition(cuDoubleComplex) : %d\n", h_info_getrf); +// /* check return value of cusolverMpGetrf */ +// assert(h_info_getrf == 0); +// // copy d_ipiv to ipiv +// cudaStat = cudaMemcpy(ipiv, d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t), cudaMemcpyDeviceToHost); +// assert(cudaStat == cudaSuccess); +// // copy matrix from d_C_A to h_C_A +// if(order=='C'){ +// cudaStat = cudaMemcpy(h_C_A_temp, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// LapackConnector::transpose(h_C_A_temp, h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(h_C_A, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to host */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* =========================================== */ +// /* CHECK RESIDUAL ON MASTER */ +// /* =========================================== */ + +// /* =========================================== */ +// /* CLEAN UP HOST WORKSPACE ON MASTER */ +// /* =========================================== */ +// cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// cusolverStat = cusolverMpDestroyGrid(gridA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); +// if (d_C_A != NULL) +// { +// cudaStat = cudaFree(d_C_A); +// assert(cudaStat == cudaSuccess); +// 
// d_C_A = NULL;
// }
// if (d_ipiv != NULL)
// {
//     cudaStat = cudaFree(d_ipiv);
//     assert(cudaStat == cudaSuccess);
//     d_ipiv = NULL;
// }

// if(h_C_A_temp != NULL)
// {
//     delete [] h_C_A_temp;
//     h_C_A_temp = NULL;
// }


// if (d_work_getrf_C != NULL)
// {
//     cudaStat = cudaFree(d_work_getrf_C);
//     assert(cudaStat == cudaSuccess);
//     d_work_getrf_C = NULL;
// }

// if (d_info_getrf != NULL)
// {
//     cudaStat = cudaFree(d_info_getrf);
//     assert(cudaStat == cudaSuccess);
//     d_info_getrf = NULL;
// }
// if (h_work_getrf_C)
// {
//     free(h_work_getrf_C);
//     h_work_getrf_C = NULL;
// }
// /* Destroy cusolverMp handle */
// cusolverStat = cusolverMpDestroy(cusolverMpHandle);
// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

// /* sync before cal_comm_destroy */
// calStat = cal_comm_barrier(cal_comm, localStream);
// assert(calStat == CAL_OK);

// /* destroy CAL communicator */
// calStat = cal_comm_destroy(cal_comm);
// assert(calStat == CAL_OK);

// /* destroy user stream */
// cudaStat = cudaStreamDestroy(localStream);
// assert(cudaStat == cudaSuccess);

// /* MPI barrier before MPI_Finalize */
// MPI_Barrier(MPI_COMM_WORLD);
// // printf("success in test\n");
// return;
// }
// #endif

#ifdef ENABLE_NVHPC
/**
 * @brief Distributed in-place LU factorization (ScaLAPACK-style p?getrf) of a
 *        block-cyclic double-complex matrix on GPUs via cusolverMp.
 *
 * @param gpu_dev_stream  per-process GPU context (CUDA stream, cusolverMp handle,
 *                        CAL communicator, rank/size) reused across calls
 * @param d_A             device-resident distributed matrix; overwritten with its
 *                        LU factors
 * @param ia, ja          1-based global row/column offsets of the submatrix
 * @param arrdesc_pi      ScaLAPACK array descriptor of the global matrix
 * @param d_ipiv          device buffer receiving the local pivot indices
 * @param d_info_getrf    device int receiving the getrf status (asserted == 0)
 * @param order           'C' (column-major grid) or 'R' (row-major grid); the
 *                        process-grid dimensions and local leading dimension are
 *                        swapped accordingly
 *
 * NOTE(review): on an invalid `order` the error branches only fprintf and fall
 * through, leaving numRowDevices/numColDevices (and LLDA) uninitialized —
 * callers are presumably guaranteed to pass 'C' or 'R'; confirm.
 */
void CudaConnector::pzgetrf_nvhpc(const GpuDeviceStream&gpu_dev_stream, ComplexMatrixDevice &d_A, const int &ia,
    const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, const char& order)
{
    // Global matrix extents from the ScaLAPACK descriptor.
    const int64_t M = arrdesc_pi.m();
    const int64_t N = arrdesc_pi.n();

    const int64_t IA = ia;
    const int64_t JA = ja;

    /* Tile sizes */
    const int64_t MA = arrdesc_pi.mb();
    const int64_t NA = arrdesc_pi.nb();
    // Map the BLACS process grid onto the cusolverMp device grid; for a
    // column-major ('C') grid the row/column device counts are swapped.
    int numRowDevices, numColDevices;
    if(order=='C'){
        numRowDevices = arrdesc_pi.npcols();
        numColDevices = arrdesc_pi.nprows();
    }
    else if(order=='R'){
        numRowDevices = arrdesc_pi.nprows();
        numColDevices = arrdesc_pi.npcols();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n");
    }


    // First process row/column owning the matrix (always the grid origin here).
    const uint32_t RSRCA = 0;
    const uint32_t CSRCA = 0;

    int mpiRank = gpu_dev_stream.rank;
    int mpiCommSize = gpu_dev_stream.nranks;

    int local_device = gpu_dev_stream.local_device;

    const int rank = mpiRank;
    const int commSize = mpiCommSize;
    /* Library handles */
    cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle;
    cal_comm_t cal_comm = gpu_dev_stream.cal_comm;

    /* Error codes */
    cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat = cudaSuccess;
    cudaStream_t localStream = gpu_dev_stream.stream;
    /* Initialize cusolverMp library handle */
    #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION
    double time_start_cusolverMpCreate=omp_get_wtime();
    #endif
    // cusolverStat = cusolverMpCreate(&cusolverMpHandle, local_device, localStream);
    #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION
    // printf("time for cusolverMpCreate:%f seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank);
    #endif
    // NOTE(review): cusolverMpCreate above is commented out (handle comes from
    // gpu_dev_stream), so this assert on the untouched status is vacuous.
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* cusolverMp grids */
    cusolverMpGrid_t gridA_C = NULL;

    /* cusolverMp matrix descriptors */
    cusolverMpMatrixDescriptor_t descrA_C = NULL;

    /* Distributed matrices */
    void* d_C_A = d_A.ptr(); // for cuDoubleComplex


    /* Distributed device workspace */
    void* d_work_getrf_C = NULL;

    /* Distributed host workspace */
    void* h_work_getrf_C = NULL;

    /* size of workspace on device */
    size_t workspaceInBytesOnDevice_getrf_C = 0;

    /* size of workspace on host */
    size_t workspaceInBytesOnHost_getrf_C = 0;

    /* Single process per device */
    assert((numRowDevices * numColDevices) == commSize);

    /* =========================================== */
    /* Create inputs on master rank                */
    /* =========================================== */

    const int64_t lda = (IA - 1) + N;
    const int64_t colsA = (JA - 1) + N;
    // cuDoubleComplex* h_C_A = NULL;
    // Local leading dimension / local column count depend on the grid order.
    int64_t LLDA, localColsA;
    if(order=='C'){
        localColsA =arrdesc_pi.m_loc();
        LLDA=arrdesc_pi.n_loc();
    }else if(order=='R'){
        localColsA =arrdesc_pi.n_loc();
        LLDA=arrdesc_pi.m_loc();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n");
    }

    /* =========================================== */
    /* CREATE GRID DESCRIPTORS                     */
    /* =========================================== */
    if(order=='C'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR);
    }else if(order=='R'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR);
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n");
    }
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* CREATE MATRIX DESCRIPTORS                   */
    /* =========================================== */
    cusolverStat = cusolverMpCreateMatrixDesc(
        &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);


    /* =========================================== */
    /* RESET D_INFO                                */
    /* =========================================== */

    // NOTE(review): cudaMemset writes the byte value 1 into each of the 4
    // bytes, i.e. *d_info_getrf becomes 0x01010101, not 1. It only serves as a
    // non-zero "not yet written" sentinel before getrf overwrites it — confirm
    // that is the intent.
    CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int)));


    /* =========================================== */
    /* QUERY WORKSPACE SIZE FOR MP ROUTINES        */
    /* =========================================== */
    // NOTE(review): passes N for both global dimensions (assumes the factored
    // submatrix is square, M == N) — TODO confirm against callers.
    cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle,
                                              N,
                                              N,
                                              d_C_A,
                                              IA,
                                              JA,
                                              descrA_C,
                                              d_ipiv,
                                              CUDA_C_64F,
                                              &workspaceInBytesOnDevice_getrf_C,
                                              &workspaceInBytesOnHost_getrf_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* ALLOCATE PGETRF WORKSPACE                   */
    /* =========================================== */
    cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C);
    assert(cudaStat == cudaSuccess);
    h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C);
    assert(h_work_getrf_C != NULL);


    /* sync wait for data to arrive to device */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));


    /* =========================================== */
    /* CALL PGETRF                                 */
    /* =========================================== */
    double start_time_C = omp_get_wtime();
    cusolverStat = cusolverMpGetrf(cusolverMpHandle,
                                   N,
                                   N,
                                   d_C_A,
                                   IA,
                                   JA,
                                   descrA_C,
                                   d_ipiv,
                                   CUDA_C_64F,
                                   d_work_getrf_C,
                                   workspaceInBytesOnDevice_getrf_C,
                                   h_work_getrf_C,
                                   workspaceInBytesOnHost_getrf_C,
                                   d_info_getrf);

    /* sync after cusolverMpGetrf */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));
    // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank);

    /* copy d_info_getrf to host */
    int h_info_getrf=1;
    cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream);
    assert(cudaStat == cudaSuccess);
    /* wait for d_info_getrf copy */
    gpu_dev_stream.cudaSync();
    assert(cudaStat == cudaSuccess);
    // A zero info means the factorization succeeded (no singular pivot).
    assert(h_info_getrf == 0);


    /* sync wait for data to arrive to host */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));

    /* =========================================== */
    /* CLEAN UP HOST WORKSPACE ON MASTER           */
    /* =========================================== */
    cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    cusolverStat = cusolverMpDestroyGrid(gridA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    if (d_work_getrf_C != NULL)
    {
        cudaStat = cudaFree(d_work_getrf_C);
        assert(cudaStat == cudaSuccess);
        d_work_getrf_C = NULL;
    }

    if (h_work_getrf_C)
    {
        free(h_work_getrf_C);
        h_work_getrf_C = NULL;
    }
    /* Destroy cusolverMp handle */
    // cusolverStat = cusolverMpDestroy(cusolverMpHandle);
    // assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* MPI barrier before MPI_Finalize */
    MPI_Barrier(MPI_COMM_WORLD);
    // printf("success in test\n");
    return;
}
/**
 * @brief Mixed-precision distributed LU factorization via cusolverMp using an
 *        explicit GpuDeviceStream context.
 *
 * Same flow as pzgetrf_nvhpc but the cuda data type is a parameter
 * (`computeType`) instead of hard-coded CUDA_C_64F, the device workspace is
 * allocated/freed asynchronously on the context stream, and 'c'/'r' are
 * accepted in addition to 'C'/'R' for @p order.
 *
 * @param d_A  raw device pointer to the local block-cyclic tile of the matrix;
 *             its element type must match @p computeType — TODO confirm callers.
 */
void CudaConnector::pgetrf_nvhpc_mixed_precision(
    const GpuDeviceStream&gpu_dev_stream, void *d_A,
    const int &ia, const int &ja,
    const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf,
    const cudaDataType_t &computeType,const char &order
    )
{
    const int64_t M = arrdesc_pi.m();
    const int64_t N = arrdesc_pi.n();

    const int64_t IA = ia;
    const int64_t JA = ja;

    /* Tile sizes */
    const int64_t MA = arrdesc_pi.mb();
    const int64_t NA = arrdesc_pi.nb();
    // Swap the device-grid dimensions for a column-major process grid.
    int numRowDevices, numColDevices;
    if(order=='C'||order=='c'){
        numRowDevices = arrdesc_pi.npcols();
        numColDevices = arrdesc_pi.nprows();
    }
    else if(order=='R'||order=='r'){
        numRowDevices = arrdesc_pi.nprows();
        numColDevices = arrdesc_pi.npcols();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n");
    }


    const uint32_t RSRCA = 0;
    const uint32_t CSRCA = 0;

    int local_device = gpu_dev_stream.local_device;

    const int rank = gpu_dev_stream.rank;
    const int commSize = gpu_dev_stream.nranks;
    /* Library handles */
    cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle;
    cal_comm_t cal_comm = gpu_dev_stream.cal_comm;

    /* Error codes */
    cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat = cudaSuccess;
    cudaStream_t localStream = gpu_dev_stream.stream;

    /* cusolverMp grids */
    cusolverMpGrid_t gridA_C = NULL;

    /* cusolverMp matrix descriptors */
    cusolverMpMatrixDescriptor_t descrA_C = NULL;

    /* Distributed matrices */
    void* d_C_A = d_A; //


    /* Distributed device workspace */
    void* d_work_getrf_C = NULL;

    /* Distributed host workspace */
    void* h_work_getrf_C = NULL;

    /* size of workspace on device */
    size_t workspaceInBytesOnDevice_getrf_C = 0;

    /* size of workspace on host */
    size_t workspaceInBytesOnHost_getrf_C = 0;

    /* Single process per device */
    assert((numRowDevices * numColDevices) == commSize);

    const int64_t lda = (IA - 1) + N;
    const int64_t colsA = (JA - 1) + N;
    // cuDoubleComplex* h_C_A = NULL;
    // Local leading dimension depends on the grid order.
    int64_t LLDA, localColsA;
    if(order=='C'||order=='c'){
        localColsA =arrdesc_pi.m_loc();
        LLDA=arrdesc_pi.n_loc();
    }else if(order=='R'||order=='r'){
        localColsA =arrdesc_pi.n_loc();
        LLDA=arrdesc_pi.m_loc();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n");
    }

    /* =========================================== */
    /* CREATE GRID DESCRIPTORS                     */
    /* =========================================== */
    if(order=='C'||order=='c'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR);
    }else if(order=='R'||order=='r'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR);
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n");
    }
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* CREATE MATRIX DESCRIPTORS                   */
    /* =========================================== */
    cusolverStat = cusolverMpCreateMatrixDesc(
        &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);


    /* =========================================== */
    /* RESET D_INFO                                */
    /* =========================================== */

    // NOTE(review): byte-wise memset — *d_info_getrf becomes 0x01010101, a
    // non-zero sentinel only.
    CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int)));


    /* =========================================== */
    /* QUERY WORKSPACE SIZE FOR MP ROUTINES        */
    /* =========================================== */
    cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle,
                                              N,
                                              N,
                                              d_C_A,
                                              IA,
                                              JA,
                                              descrA_C,
                                              d_ipiv,
                                              computeType,
                                              &workspaceInBytesOnDevice_getrf_C,
                                              &workspaceInBytesOnHost_getrf_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* ALLOCATE PGETRF WORKSPACE                   */
    /* =========================================== */
    CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream));
    h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C);
    assert(h_work_getrf_C != NULL);

    /* =========================================== */
    /* CALL PGETRF                                 */
    /* =========================================== */
    double start_time_C = omp_get_wtime();
    CUSOLVERMP_CHECK(cusolverMpGetrf(
        cusolverMpHandle, N, N,
        d_C_A, IA, JA, descrA_C,
        d_ipiv, computeType,
        d_work_getrf_C, workspaceInBytesOnDevice_getrf_C,
        h_work_getrf_C, workspaceInBytesOnHost_getrf_C,
        d_info_getrf));
    // NOTE(review): cusolverStat is not assigned by the CUSOLVERMP_CHECK call
    // above, so this assert is vacuous (the macro already checks the status).
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* sync after cusolverMpGetrf */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));

    /* copy d_info_getrf to host */
    int h_info_getrf=1;
    cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream);
    assert(cudaStat == cudaSuccess);
    /* wait for d_info_getrf copy */
    gpu_dev_stream.cudaSync();
    assert(cudaStat == cudaSuccess);
    assert(h_info_getrf == 0);

    /* =========================================== */
    /* CLEAN UP HOST WORKSPACE ON MASTER           */
    /* =========================================== */
    cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    cusolverStat = cusolverMpDestroyGrid(gridA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    if (d_work_getrf_C != NULL)
    {
        CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream));
        d_work_getrf_C = NULL;
    }

    if (h_work_getrf_C)
    {
        free(h_work_getrf_C);
        h_work_getrf_C = NULL;
    }
    /* Destroy cusolverMp handle */
    // cusolverStat = cusolverMpDestroy(cusolverMpHandle);
    // assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* MPI barrier before MPI_Finalize */
    MPI_Barrier(MPI_COMM_WORLD);
    // printf("success in test\n");
    return;
}
+void CudaConnector::pgetrf_nvhpc_mixed_precision( + void *d_A, const int &ia, const int &ja, + const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, + const cudaDataType_t &computeType,const char &order + ) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'||order=='c'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'||order=='r'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int local_device = device_stream.local_device; + + const int rank = mpi_comm_global_h.myid; + const int commSize = mpi_comm_global_h.nprocs; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = device_stream.cusolverMp_handle; + cal_comm_t cal_comm = device_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = (cudaStream_t)device_stream.stream; + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A; // + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + assert((numRowDevices * numColDevices) == commSize); + + const int64_t lda = (IA - 1) + N; + const int64_t 
colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'||order=='c'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'||order=='r'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'||order=='c'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'||order=='r'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + computeType, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + 
/* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream)); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + CUSOLVERMP_CHECK(cusolverMpGetrf( + cusolverMpHandle, N, N, + d_C_A, IA, JA, descrA_C, + d_ipiv, computeType, + d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, workspaceInBytesOnHost_getrf_C, + d_info_getrf)); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + device_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream)); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + return; +} +void CudaConnector::pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + const void* d_A, const int64_t& IA, const int64_t& JA, const 
LIBRPA::Array_Desc &arrdesc_A, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + int* d_info, const cudaDataType_t& compute_type, + const char& order +) +{ + const int64_t N = arrdesc_A.m(); + const int64_t NRHS = arrdesc_B.n(); + assert(arrdesc_A.m()==arrdesc_A.n()); + + int h_info = 1; + size_t workspaceInBytesOnDevice, workspaceInBytesOnHost; + void* h_work = NULL; + void* d_work = NULL; + const int64_t mbA = arrdesc_A.mb(); + const int64_t nbA = arrdesc_A.nb(); + const int64_t mbB = arrdesc_B.mb(); + const int64_t nbB = arrdesc_B.nb(); + assert(arrdesc_A.npcols() == arrdesc_B.npcols()); + assert(arrdesc_A.nprows() == arrdesc_B.nprows()); + int numRowDevices, numColDevices; + ORDER_CHECK(order); + if(order=='C'||order=='c'){ + assert(N==NRHS); + numRowDevices = arrdesc_A.npcols(); + numColDevices = arrdesc_A.nprows(); + } + else{ + numRowDevices = arrdesc_A.nprows(); + numColDevices = arrdesc_A.npcols(); + } + int64_t LLDA, localColsA, LLDB, localColsB; + if(order=='C'||order=='c'){ + LLDA=arrdesc_A.n_loc(); + localColsA =arrdesc_A.m_loc(); + LLDB=arrdesc_B.n_loc(); + localColsB =arrdesc_B.m_loc(); + }else{ + LLDA=arrdesc_A.m_loc(); + localColsA =arrdesc_A.n_loc(); + LLDB=arrdesc_B.m_loc(); + localColsB =arrdesc_B.n_loc(); + } + int mpiCommSize = gpu_dev_stream.nranks; + int rank = gpu_dev_stream.rank; + + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + + cusolverMpGrid_t grid = NULL; + cusolverMpMatrixDescriptor_t descrA = NULL; + cusolverMpMatrixDescriptor_t descrB = NULL; + + CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid( + cusolverMpHandle, &grid, gpu_dev_stream.cal_comm, + numRowDevices, numColDevices, + (order=='c'||order=='C')?CUSOLVERMP_GRID_MAPPING_COL_MAJOR:CUSOLVERMP_GRID_MAPPING_ROW_MAJOR) + ); + + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrA, grid, compute_type, + (IA - 1) + N, (JA - 1) + N, + mbA, nbA, 0, 0, LLDA) + ); + 
CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrB, grid, compute_type, + (IB - 1) + N, (JB - 1) + NRHS, + mbB, nbB, 0, 0, LLDB) + ); + + CUSOLVERMP_CHECK(cusolverMpGetrs_bufferSize( + cusolverMpHandle, trans, N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + &workspaceInBytesOnDevice,&workspaceInBytesOnHost) + ); + gpu_dev_stream.calSync(); + if (workspaceInBytesOnHost > 0) + { + h_work = (void*)malloc(workspaceInBytesOnHost); + assert(h_work != NULL); + } + if (workspaceInBytesOnDevice > 0) + { + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream)); + } + gpu_dev_stream.calSync(); + CUSOLVERMP_CHECK(cusolverMpGetrs( + cusolverMpHandle, trans, + N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + d_work, workspaceInBytesOnDevice, + h_work, workspaceInBytesOnHost, + d_info) + ); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if(h_info!=0) + { + fprintf(stderr, "Error: cusolverMpgetrs failed with info=%d\n", h_info); + exit(1); + } + if(h_work!=NULL) + { + free(h_work); + } + if(d_work!=NULL){ + CUDA_CHECK(cudaFreeAsync(d_work, gpu_dev_stream.stream)); + } +} +void CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &arrdesc_A, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + const cudaDataType_t& compute_type, const char& order +) +{ + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.n_loc(),gpu_dev_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.m_loc(),gpu_dev_stream.stream)); + } + pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, 
d_A, 1, 1, arrdesc_A, + d_ipiv, d_info, + CUDA_C_64F, order + ); + pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, trans, + d_A, 1, 1, arrdesc_A, + d_ipiv, + d_B, 1, 1, arrdesc_B, + d_info, + CUDA_C_64F, order + ); + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); +} +// void CudaConnector::pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, +// const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, +// const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, +// const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C) +// { +// using input_t = cuDoubleComplex; +// using output_t = cuDoubleComplex; +// using compute_t = cuDoubleComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_64F; +// const cudaDataType_t cuda_output_type = CUDA_C_64F; +// cublasComputeType_t cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + +// int64_t mbA = arrdesc_A.mb(); +// int64_t nbA = arrdesc_A.nb(); +// int64_t mbB = arrdesc_B.mb(); +// int64_t nbB = arrdesc_B.nb(); +// int64_t mbC = arrdesc_C.mb(); +// int64_t nbC = arrdesc_C.nb(); +// int nprow = arrdesc_A.nprows(); +// int npcol = arrdesc_A.npcols(); +// char grid_layout = 'r'; +// bool verbose = false; +// int rank, nranks; +// MPI_Comm_size(MPI_COMM_WORLD, &nranks); +// MPI_Comm_rank(MPI_COMM_WORLD, &rank); +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = arrdesc_A.myprow(); +// const int mypcol = arrdesc_A.mypcol(); +// if(rank==0) +// { +// printf("m:%" PRId64 ", n:%" PRId64 ", k:%" PRId64 "\n", m, n, k); +// printf("nprow:%d, npcol:%d\n", nprow, npcol); +// printf("mbA:%" PRId64 ", nbA:%" PRId64 "\n", mbA, nbA); +// } +// const int 
local_device = getLocalDevice(); +// printf("myrank:%d, myprow:%d, mypcol:%d, local_device:%d\n", rank, myprow, mypcol, local_device); +// CUDA_CHECK(cudaSetDevice(local_device)); +// CUDA_CHECK(cudaFree(nullptr)); + +// cal_comm_t cal_comm; +// cal_comm_create_params_t params; +// { +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = nranks; +// params.local_device = local_device; + +// CAL_CHECK(cal_comm_create(params, &cal_comm)); +// } +// cudaStream_t stream = nullptr; +// CUDA_CHECK(cudaStreamCreate(&stream)); + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + +// cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; + +// input_t* d_A = nullptr; +// input_t* d_B = nullptr; +// output_t* d_C = nullptr; + +// void* d_work = nullptr; + +// compute_t alpha = {alphaD,0.0}; +// compute_t beta = {betaD,0.0}; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// const int64_t llda = cublasMpNumroc(global_m_a, mbA, myprow, 0, nprow); +// const int64_t loc_n_a = cublasMpNumroc(global_n_a, nbA, mypcol, 0, npcol); +// printf("rank:%d, llda=%" PRId64 ", loc_n_a=%" PRId64 "\n", rank, llda, loc_n_a); +// const int64_t lldb = cublasMpNumroc(global_m_b, mbB, myprow, 0, nprow); +// const int64_t loc_n_b = cublasMpNumroc(global_n_b, nbB, mypcol, 0, npcol); + +// const int64_t lldc = cublasMpNumroc(global_m_c, mbC, myprow, 0, nprow); +// const int64_t loc_n_c = 
cublasMpNumroc(global_n_c, nbC, mypcol, 0, npcol); + +// CUDA_CHECK(cudaMallocAsync(&d_A, llda * loc_n_a * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_B, lldb * loc_n_b * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_C, lldc * loc_n_c * sizeof(output_t), stream)); + +// CUDA_CHECK(cudaMemcpyAsync(d_A, A, llda * loc_n_a * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_B, B, lldb * loc_n_b * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_C, C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyHostToDevice, stream)); + +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// cal_comm, +// &grid)); + +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// cublasOperation_t transA,transB; +// if(transa=='N') +// transA=CUBLAS_OP_N; +// else if(transa=='T') +// transA=CUBLAS_OP_T; +// else if(transa=='C') +// transA=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transa=%c is not supported\n", transa); +// exit(1); +// } +// if(transb=='N') +// transB=CUBLAS_OP_N; +// else if(transb=='T') +// transB=CUBLAS_OP_T; +// else if(transb=='C') +// transB=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transb=%c is not supported\n", transb); +// exit(1); +// } + + + +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// &alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// &beta, +// d_C, +// ic, +// jc, +// descC, +// 
// cublas_compute_type,
// &workspaceInBytesOnDevice,
// &workspaceInBytesOnHost));

// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream));
// printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank);
// std::vector h_work(workspaceInBytesOnHost);

// CUDA_CHECK(cudaStreamSynchronize(stream));

// const double begin = MPI_Wtime();

// CUBLASMP_CHECK(cublasMpGemm(
//     handle,
//     transA,
//     transB,
//     m,
//     n,
//     k,
//     &alpha,
//     d_A,
//     ia,
//     ja,
//     descA,
//     d_B,
//     ib,
//     jb,
//     descB,
//     &beta,
//     d_C,
//     ic,
//     jc,
//     descC,
//     cublas_compute_type,
//     d_work,
//     workspaceInBytesOnDevice,
//     h_work.data(),
//     workspaceInBytesOnHost));
// printf("Duration(before synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank);
// CUDA_CHECK(cudaStreamSynchronize(stream));
// printf("Duration(after synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank);
// if(verbose){
// }
// CUDA_CHECK(cudaMemcpyAsync(C, d_C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyDeviceToHost, stream));
// CUDA_CHECK(cudaStreamSynchronize(stream));

// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA));
// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB));
// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC));

// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid));

// CUBLASMP_CHECK(cublasMpDestroy(handle));

// CUDA_CHECK(cudaFreeAsync(d_A, stream));
// CUDA_CHECK(cudaFreeAsync(d_B, stream));
// CUDA_CHECK(cudaFreeAsync(d_C, stream));
// CUDA_CHECK(cudaFreeAsync(d_work, stream));

// CAL_CHECK(cal_comm_destroy(cal_comm));
// CUDA_CHECK(cudaStreamDestroy(stream));

// MPI_Barrier(MPI_COMM_WORLD);

// }
/**
 * @brief Distributed GEMM on device-resident matrices whose cuBLASMp matrix
 *        descriptors (`desc_cublas`) have already been created.
 *
 * Computes C := alpha * op(A) * op(B) + beta * C entirely on the GPU: queries
 * the cuBLASMp workspace size, allocates the device workspace, runs
 * cublasMpGemm, and frees the workspace.
 *
 * @param handle  ready-to-use cuBLASMp handle bound to a stream/grid
 * @param alpha, beta  pointers to scalars matching @p cublas_compute_type
 * @param ia..jc  1-based global offsets of the three submatrices
 *
 * NOTE(review): `std::vector h_work(...)` is missing its element type here —
 * the template argument (presumably std::byte or uint8_t for a raw host
 * workspace) appears to have been stripped by text extraction; confirm against
 * the repository copy.
 */
void CudaConnector::pgemm_device(cublasMpHandle_t handle,cublasOperation_t transA,cublasOperation_t transB,const int &m,const int &n,const int &k,
    const void *alpha,
    const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja,
    const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb,
    const void *beta,
    ComplexMatrixDevice &d_C,int64_t ic,int64_t jc,
    cublasComputeType_t cublas_compute_type)
{
    void* d_work;

    size_t workspaceInBytesOnDevice,workspaceInBytesOnHost;
    // Ask cuBLASMp how much device/host scratch the GEMM needs.
    CUBLASMP_CHECK(cublasMpGemm_bufferSize(handle,transA,transB,m,n,k,
                                           alpha,
                                           (const void *)(d_A.ptr()),ia,ja,d_A.desc_cublas,
                                           (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas,
                                           beta,
                                           d_C.ptr(),ic,jc,d_C.desc_cublas,
                                           cublas_compute_type,
                                           &workspaceInBytesOnDevice,
                                           &workspaceInBytesOnHost));
    CUDA_CHECK(cudaMalloc((void**)&d_work,workspaceInBytesOnDevice));
    std::vector h_work(workspaceInBytesOnHost);
    CUBLASMP_CHECK(cublasMpGemm(handle,transA,transB,m,n,k,
                                alpha,
                                (const void *)d_A.ptr(),ia,ja,d_A.desc_cublas,
                                (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas,
                                beta,
                                d_C.ptr(),ic,jc,d_C.desc_cublas,
                                cublas_compute_type,
                                d_work,workspaceInBytesOnDevice,
                                h_work.data(),workspaceInBytesOnHost));

    CUDA_CHECK(cudaFree(d_work));
}

/**
 * @brief Self-contained distributed GEMM via cuBLASMp from ScaLAPACK-style
 *        array descriptors: creates the cuBLASMp handle, device grid and the
 *        three matrix descriptors per call, runs the GEMM, then tears
 *        everything down.
 *
 * C := alpha * op(A) * op(B) + beta * C for device-resident double-complex
 * matrices. The global matrix extents are derived from (m, n, k), the offsets
 * and the transpose flags. Process grid mapping is hard-coded row-major
 * (grid_layout = 'r').
 *
 * NOTE(review): myprow/mypcol are computed but never used below.
 * NOTE(review): d_work is allocated with cudaMallocAsync on the context stream
 * but released with synchronous cudaFree after cudaSync() — legal, though
 * asymmetric; cudaFreeAsync would match the allocation.
 * NOTE(review): `std::vector h_work(...)` — element type stripped by text
 * extraction, same as in pgemm_device; confirm.
 */
void CudaConnector::pgemm_nvhpc(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k,
    const void *alpha,
    const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA,
    const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB,
    const void *beta,
    ComplexMatrixDevice & d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC,
    cublasComputeType_t cublas_compute_type)
{
    using input_t = cuDoubleComplex;
    using output_t = cuDoubleComplex;
    using compute_t = cuDoubleComplex;
    const cudaDataType_t cuda_input_type = CUDA_C_64F;
    const cudaDataType_t cuda_output_type = CUDA_C_64F;

    // Blocking factors and grid shape from the ScaLAPACK descriptors.
    int64_t mbA = array_descA.mb();
    int64_t nbA = array_descA.nb();
    int64_t mbB = array_descB.mb();
    int64_t nbB = array_descB.nb();
    int64_t mbC = array_descC.mb();
    int64_t nbC = array_descC.nb();
    int nprow = array_descA.nprows();
    int npcol = array_descA.npcols();
    int llda= array_descA.lld();
    int lldb= array_descB.lld();
    int lldc= array_descC.lld();
    cublasMpGrid_t grid = nullptr;

    cublasMpMatrixDescriptor_t descA = nullptr;
    cublasMpMatrixDescriptor_t descB = nullptr;
    cublasMpMatrixDescriptor_t descC = nullptr;
    char grid_layout = 'r';
    int rank=gpu_dev_stream.rank;
    int nranks=gpu_dev_stream.nranks;

    const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol);
    const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol);

    cublasMpHandle_t handle = nullptr;
    CUBLASMP_CHECK(cublasMpCreate(&handle, gpu_dev_stream.stream));
    void* d_work = nullptr;


    size_t workspaceInBytesOnDevice = 0;
    size_t workspaceInBytesOnHost = 0;

    // Global extents of A and B depend on whether they are transposed.
    int64_t global_m_a, global_n_a, global_m_b, global_n_b;
    if(transA == CUBLAS_OP_N){
        global_m_a = (ia - 1) + m;
        global_n_a = (ja - 1) + k;
    }else{
        global_m_a = (ia - 1) + k;
        global_n_a = (ja - 1) + m;
    }
    if(transB == CUBLAS_OP_N){
        global_m_b = (ib - 1) + k;
        global_n_b = (jb - 1) + n;
    }else{
        global_m_b = (ib - 1) + n;
        global_n_b = (jb - 1) + k;
    }
    const int64_t global_m_c = (ic - 1) + m;
    const int64_t global_n_c = (jc - 1) + n;

    CUBLASMP_CHECK(cublasMpGridCreate(
        handle, nprow, npcol,
        grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR,
        gpu_dev_stream.cal_comm, &grid)
    );

    CUBLASMP_CHECK(
        cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA));
    CUBLASMP_CHECK(
        cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB));
    CUBLASMP_CHECK(
        cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC));

    CUBLASMP_CHECK(cublasMpGemm_bufferSize(
        handle, transA, transB,
        m, n, k,
        alpha,
        d_A.ptr(), ia, ja, descA,
        d_B.ptr(), ib, jb, descB,
        beta,
        d_C.ptr(), ic, jc, descC,
        cublas_compute_type,
        &workspaceInBytesOnDevice, &workspaceInBytesOnHost)
    );
    gpu_dev_stream.cudaSync();
    CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream));
    std::vector h_work(workspaceInBytesOnHost);

    CUBLASMP_CHECK(cublasMpGemm(
        handle, transA, transB,
        m, n, k,
        alpha,
        d_A.ptr(), ia, ja, descA,
        d_B.ptr(), ib, jb, descB,
        beta,
        d_C.ptr(), ic, jc, descC,
        cublas_compute_type,
        d_work, workspaceInBytesOnDevice,
        h_work.data(), workspaceInBytesOnHost
    ));

    gpu_dev_stream.cudaSync();
    CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA));
    CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB));
    CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC));

    CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid));

    CUBLASMP_CHECK(cublasMpDestroy(handle));
    CUDA_CHECK(cudaFree(d_work));

    MPI_Barrier(MPI_COMM_WORLD);
}
// void CudaConnector::pgemm_nvhpc_cuFloatComplex(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k,
//     const void *alpha,
//     const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA,
//     const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB,
//     const
void *beta, +// cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, +// cublasComputeType_t cublas_compute_type) +// { +// using input_t = cuFloatComplex; +// using output_t = cuFloatComplex; +// using compute_t = cuFloatComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_32F; +// const cudaDataType_t cuda_output_type = CUDA_C_32F; + +// int64_t mbA = array_descA.mb(); +// int64_t nbA = array_descA.nb(); +// int64_t mbB = array_descB.mb(); +// int64_t nbB = array_descB.nb(); +// int64_t mbC = array_descC.mb(); +// int64_t nbC = array_descC.nb(); +// int nprow = array_descA.nprows(); +// int npcol = array_descA.npcols(); +// int llda= array_descA.lld(); +// int lldb= array_descB.lld(); +// int lldc= array_descC.lld(); +// cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; +// char grid_layout = 'r'; +// int rank=gpu_dev_stream.rank; +// int nranks=gpu_dev_stream.nranks; +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); +// const int mypcol = (grid_layout == 'c' ? 
rank / nprow : rank % npcol); +// const int local_device = gpu_dev_stream.local_device; +// cudaStream_t stream = gpu_dev_stream.stream; + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); +// void* d_work = nullptr; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// gpu_dev_stream.cudaSync(); +// // printf("before create grid, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// gpu_dev_stream.cal_comm, +// &grid)); +// // printf("after create grid, rank:%d\n", rank); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// // printf("after create desc, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// &workspaceInBytesOnDevice, +// &workspaceInBytesOnHost)); +// // printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); +// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); +// std::vector 
h_work(workspaceInBytesOnHost); + +// gpu_dev_stream.cudaSync(); + +// // const double begin = MPI_Wtime(); +// // printf("before gemm, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// d_work, +// workspaceInBytesOnDevice, +// h_work.data(), +// workspaceInBytesOnHost)); +// // printf("after gemm, rank:%d\n", rank); +// CUDA_CHECK(cudaStreamSynchronize(stream)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + +// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + +// CUBLASMP_CHECK(cublasMpDestroy(handle)); +// CUDA_CHECK(cudaFree(d_work)); + +// MPI_Barrier(MPI_COMM_WORLD); +// } +void CudaConnector::pgemm_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = 
array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=gpu_dev_stream.rank; + int nranks=gpu_dev_stream.nranks; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); + const int local_device = gpu_dev_stream.local_device; + cudaStream_t stream = gpu_dev_stream.stream; + + cublasMpHandle_t handle = nullptr; + // printf("before create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + // printf("after create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + // gpu_dev_stream.cudaSync(); + // printf("before create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? 
CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + // printf("after create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + // printf("workspaceInBytesOnDevice=%zu GiB, workspaceInBytesOnHost=%zu GiB, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); + gpu_dev_stream.cudaSync(); + // printf("before malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + // printf("after malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + std::vector h_work(workspaceInBytesOnHost); + + // gpu_dev_stream.cudaSync(); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + // gpu_dev_stream.cudaSync(); + // printf("before free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + // printf("after free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + gpu_dev_stream.cudaSync(); + 
CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); + + MPI_Barrier(MPI_COMM_WORLD); +} + + +void CudaConnector::pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=mpi_comm_global_h.myid; + int nranks=mpi_comm_global_h.nprocs; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? 
rank / nprow : rank % npcol); + const int local_device = device_stream.local_device; + cudaStream_t stream = (cudaStream_t)device_stream.stream; + + cublasMpHandle_t handle = nullptr; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + device_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + device_stream.cudaSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + 
cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + + device_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +void CudaConnector::pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type, + const char& order +) +{ + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + + int nprow, npcol; + assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + ORDER_CHECK(order); + if(order == 'c'||order == 'C'){ + nprow = array_descA.npcols(); + npcol = array_descA.nprows(); + }else{ + nprow = array_descA.nprows(); + npcol = array_descA.npcols(); + } + int llda,loc_n_a,lldb,loc_n_b,mA,nA,mB,nB; + if(order == 'c'||order == 'C'){ + llda = array_descA.n_loc(); + loc_n_a = array_descA.m_loc(); + lldb = array_descB.n_loc(); + loc_n_b = array_descB.m_loc(); + mA = array_descA.n(); + nA = array_descA.m(); + mB = array_descB.n(); + nB = array_descB.m(); + + }else{ + llda = array_descA.m_loc(); + loc_n_a = array_descA.n_loc(); + lldb = array_descB.m_loc(); + loc_n_b = array_descB.n_loc(); + mA = array_descA.m(); + nA = array_descA.n(); + mB = array_descB.m(); + nB = array_descB.n(); + } + if(trans == CUBLAS_OP_N){ 
+ assert(mA == nA); + assert(mB == nB); + }else{ + assert(mA == nB); + assert(mB == nA); + } + const int global_m_a = ia-1+mA; + const int global_n_a = ja-1+nA; + const int global_m_b = ib-1+mB; + const int global_n_b = jb-1+nB; + int rank = gpu_dev_stream.rank; + int nranks = gpu_dev_stream.nranks; + + const int local_device = gpu_dev_stream.local_device; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + order=='c'||order=='C'?CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGeadd_bufferSize( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + gpu_dev_stream.calSync(); + std::vector h_work(workspaceInBytesOnHost); + CUBLASMP_CHECK(cublasMpGeadd( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} + +void 
CudaConnector::pgemr2d_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const int& m, const int& n, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type +) +{ + + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + int nprow, npcol; + assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + nprow = array_descA.nprows(); + npcol = array_descA.npcols(); + + int llda = array_descA.m_loc(); + int loc_n_a = array_descA.n_loc(); + + int lldb = array_descB.m_loc(); + int loc_n_b = array_descB.n_loc(); + + const int global_m_a = ia-1+m; + const int global_n_a = ja-1+n; + + const int global_m_b = ib-1+m; + const int global_n_b = jb-1+n; + + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid) + ); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGemr2D_bufferSize( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, 
workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + CUBLASMP_CHECK(cublasMpGemr2D( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +#endif + diff --git a/src/cuda_connector.cu b/src/cuda_connector.cu new file mode 100644 index 00000000..f6ffa7db --- /dev/null +++ b/src/cuda_connector.cu @@ -0,0 +1,601 @@ +#include "cuda_connector.h" +#include "device_stream.h" +// add by hbchen in 2025-05-19 +__device__ int get_indxg2p(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return (isrcproc + indxglob / nb) % nprocs; +} +__device__ int get_indxg2l(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__device__ int get_index_g2l_r(const int& gindx, const int& m, const int& mb,const int &myprow, const int& irsrc, const int& nprows) +{ + return myprow != get_indxg2p(gindx, mb, myprow, irsrc, nprows) || + gindx >= m + ? -1 + : get_indxg2l(gindx, mb, myprow, irsrc, nprows); +} +__device__ int get_index_g2l_c(const int& gindx, const int &n, const int &nb, const int &mypcol, const int &icsrc, const int &npcols) +{ + return mypcol != get_indxg2p(gindx, nb, mypcol, icsrc, npcols) || + gindx >= n + ? 
-1 + : get_indxg2l(gindx, nb, mypcol, icsrc, npcols); +} +__global__ void det_multiply_seq(const cuDoubleComplex *inC,const int* d_ipiv,const int& num, cuDoubleComplex* ouC){ + int f = blockIdx.x*blockDim.x + threadIdx.x; + // printf("f = %d\n", f); + ouC[f]=make_cuDoubleComplex(1.0, 0.0); + for(int i=f;i>>(d_a,d_ipiv,*d_n,d_ouC); + cudaMemcpyAsync(h_ouC, d_ouC, blockSize*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_ouC, stream); + cudaFreeAsync(d_n, stream); + for(int i=0;i>>(d_a,*d_n); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_n, stream); +} +void CudaConnector::multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream) +{ + int blockSize = 256; + cuDoubleComplex * d_num; + int *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int), stream)); + int len = mat.nr()*mat.nc(); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuDoubleComplex), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream)); + int gridSize = (mat.nr()*mat.nc() + blockSize - 1) / blockSize; + multiply_number_for_ComplexMatrixDevice_kernel<<>>(mat.ptr(), *d_num, *d_len); +} +__global__ void diag_add_ComplexMatrixDevice_kernel(cuDoubleComplex* d_a, const double& d_num, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < d_m) + { + int col = row; // Diagonal element + int local_row = get_index_g2l_r(row, d_m, d_mb, d_myprow, d_irsrc, d_nprows); + int local_col = get_index_g2l_c(col, d_n, d_nb, d_mypcol, d_icsrc, d_npcols); + if (local_row != -1 && local_col != -1 && 
local_row < d_m_loc && local_col < d_n_loc) + { + int local_index = local_row + local_col * d_m_loc; // Column-major order + d_a[local_index].x += d_num; // Add the real number to the real part + } + } +} + +void CudaConnector::diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream) +{ + double *d_num; + int blockSize = 256; + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(double), cudaMemcpyHostToDevice, stream)); + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + 
CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int gridSize = (arrdesc.m() + blockSize - 1) / blockSize; + diag_add_ComplexMatrixDevice_kernel<<>>(mat.ptr(),*d_num,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_num, stream)); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + +} +__global__ void diag_add_matrix_device_blacs_kernel(const cuDoubleComplex* num, cuDoubleComplex* d_A, const Array_Desc_Device& array_desc_device) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if(i>=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + if(jlo==-1) + return; + if(ilo>>((cuDoubleComplex*)d_num,(cuDoubleComplex*)d_A,*d_array_desc_device); + + }else 
if(compute_type == LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuFloatComplex), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(cuFloatComplex), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else if(compute_type == LIBRPA_COMPUTE_TYPE_DOUBLE){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(double), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(float), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + } + CUDA_CHECK(cudaFreeAsync(d_num, (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, (cudaStream_t)device_stream.stream)); + +} +__global__ void det_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in,cuDoubleComplex* d_out,const int& m,const int& n,const int& m_loc,const int& n_loc,const int& mb,const int& nb,const int& myprow,const int& mypcol,const int& irsrc,const int& icsrc,const int& nprows,const int& npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // printf("idx=%d\n",idx); + d_out[idx] = make_cuDoubleComplex(1.0,0.0); + + for(int i=idx;i CudaConnector::det_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice&d_A, const LIBRPA::Array_Desc &arrdesc_pi) +{ + std::complex det_loc={1.0,0.0}; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + 
CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc_pi.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc_pi.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc_pi.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc_pi.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc_pi.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc_pi.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc_pi.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc_pi.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc_pi.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc_pi.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc_pi.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc_pi.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int blockSize = 256; + int gridSize = 1; + cuDoubleComplex* d_detA; + std::complex *h_detA = new std::complex[blockSize]; + CUDA_CHECK(cudaMallocAsync((void**)&d_detA, 
sizeof(cuDoubleComplex) * blockSize, stream)); + // printf("before det_ComplexMatrixDevice_blacs_kernel\n"); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + det_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_detA,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + // printf("after det_ComplexMatrixDevice_blacs_kernel\n"); + CUDA_CHECK(cudaMemcpyAsync(h_detA, d_detA, sizeof(cuDoubleComplex) * blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_detA, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i>>(d_in, d_out, *d_m_ptr, *d_n_ptr, *d_conjugate_ptr); + // printf("after transpose kernel\n"); + CUDA_CHECK(cudaFreeAsync(d_m_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_n_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_conjugate_ptr, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + in.set_ptr(d_out); + in.set_nr(n); + in.set_nc(m); +} +__global__ void cuFloatComplex_to_cuDoubleComplex_kernel(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_out[idx] = make_cuDoubleComplex(static_cast(d_in[idx].x), static_cast(d_in[idx].y)); + } +} +void CudaConnector::cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& len, const cudaStream_t& stream) +{ + int blockSize = 
256;
    int gridSize = (len + blockSize - 1) / blockSize;
    // Launch the kernel.
    // FIXME(review): the kernel above still declares its length parameter as
    // `const int64_t&`, so `len` is staged in device memory here and `*d_len`
    // dereferences a DEVICE pointer on the HOST — only valid with
    // managed/unified memory, and `d_len` is never freed. Migrate the kernel
    // to a by-value parameter (as done for the kernels below) — TODO confirm.
    int64_t *d_len;
    CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), stream));
    CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, stream));
    cuFloatComplex_to_cuDoubleComplex_kernel<<<gridSize, blockSize, 0, stream>>>(d_in, d_out, *d_len);
}

/// Element-wise down-conversion: cuDoubleComplex -> cuFloatComplex.
/// The length is passed by value: CUDA kernel arguments are copied into the
/// kernel parameter space at launch, so no device-side staging is needed.
__global__ void cuDoubleComplex_to_cuFloatComplex_kernel(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t d_len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < d_len)
    {
        d_b[idx] = make_cuFloatComplex(static_cast<float>(d_a[idx].x), static_cast<float>(d_a[idx].y));
    }
}

/// Asynchronously convert `len` cuDoubleComplex values in d_a to
/// cuFloatComplex in d_b on `stream`.
/// Fix: the previous version copied `len` into device memory, dereferenced
/// that device pointer on the host (`*d_len` — invalid without unified
/// memory) and leaked the allocation; the scalar is now passed by value.
void CudaConnector::cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream)
{
    const int blockSize = 256;
    const int gridSize = (len + blockSize - 1) / blockSize;
    cuDoubleComplex_to_cuFloatComplex_kernel<<<gridSize, blockSize, 0, stream>>>(d_a, d_b, len);
}

/// Element-wise widening copy: float -> double.
__global__ void float_to_double_kernel(const float* d_a, double* d_b, const int64_t d_len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < d_len)
    {
        d_b[idx] = d_a[idx];
    }
}

/// Widen `len` floats in d_a to doubles in d_b on the global device stream.
/// Fix: the kernel now receives the length by value instead of through a
/// host dereference of a device pointer. The d_len staging buffer is still
/// allocated (it is freed immediately after the launch) so the surrounding
/// code is unchanged.
void CudaConnector::float_to_double_device(const float* d_a, double* d_b, const int64_t& len)
{
    const int blockSize = 256;
    const int gridSize = (len + blockSize - 1) / blockSize;
    int64_t *d_len;
    CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream));
    CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream));
    float_to_double_kernel<<<gridSize, blockSize, 0, device_stream.stream>>>(d_a, d_b, len);
CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void double_to_float_kernel(const double* d_a, float* d_b, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_b[idx] = static_cast(d_a[idx]); + } +} +void CudaConnector::double_to_float_device(const double* d_a, float* d_b, const int64_t& len) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream)); + double_to_float_kernel<<>>(d_a, d_b, *d_len); + CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void trace_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in, cuDoubleComplex* d_out, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + d_out[idx] = make_cuDoubleComplex(0.0, 0.0); + for(int i=idx;i *trace_z = (std::complex*)trace; + CUDA_CHECK(cudaMallocAsync((void**)&d_trace, blockSize*sizeof(std::complex), device_stream.stream)); + h_trace = (std::complex*)malloc(blockSize*sizeof(std::complex)); + std::complex *h_trace_z = (std::complex*)h_trace; + *(trace_z) = std::complex(0.0, 0.0); + trace_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_trace, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_trace, d_trace, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i *det_z = (std::complex*)det; + CUDA_CHECK(cudaMallocAsync((void**)&d_det, blockSize*sizeof(std::complex), device_stream.stream)); + h_det = (std::complex*)malloc(blockSize*sizeof(std::complex)); + 
std::complex *h_det_z = (std::complex*)h_det; + *(det_z) = std::complex(1.0, 0.0); + det_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_det, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_det, d_det, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i CudaConnector::trace_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice& d_A, const LIBRPA::Array_Desc &arrdesc) +{ + + int blockSize = 256; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, 
&arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + cuDoubleComplex *d_out; + CUDA_CHECK(cudaMallocAsync((void**)&d_out, sizeof(cuDoubleComplex)*blockSize, stream)); + std::complex *h_out; + std::complex trace_loc = {0.0, 0.0}; + h_out = new std::complex[blockSize]; + int gridSize = 1; + trace_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_out,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_out, d_out, sizeof(cuDoubleComplex)*blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_out, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + 
if(jlo==-1) + return; + d_A[ilo+jlo*array_desc_device.m_loc()] = make_cuDoubleComplex(1.0, 0.0); +} +void ComplexMatrixDevice::set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device& array_desc_device){ + this->set_as_zero(gpu_dev_stream.stream); + Array_Desc_Device* d_array_desc_device; + // printf("array_desc_device.m():%d\n", array_desc_device.m()); + CUDA_CHECK(cudaMallocAsync((void**)&d_array_desc_device, sizeof(Array_Desc_Device), gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_array_desc_device, &array_desc_device, sizeof(Array_Desc_Device), cudaMemcpyHostToDevice, gpu_dev_stream.stream)); + if(array_desc_device.m()!=array_desc_device.n()){ + fprintf(stderr,"Error: ComplexMatrixDevice::set_as_identity: m and n are not equal\n"); + std::abort(); + } + int blockSize = 256; + int gridSize = (array_desc_device.m()+blockSize-1)/blockSize; + set_as_identity_kernel<<>>(d_data, *d_array_desc_device); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, gpu_dev_stream.stream)); +} \ No newline at end of file diff --git a/src/cuda_connector.h b/src/cuda_connector.h new file mode 100644 index 00000000..f496bf08 --- /dev/null +++ b/src/cuda_connector.h @@ -0,0 +1,621 @@ +#ifndef CUDA_CONNECTOR_H +#define CUDA_CONNECTOR_H + +#pragma once +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include "base_blacs.h" +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#include "scalapack_connector.h" +#include +#include +#include "gpu_device_stream.h" +#endif +using LIBRPA::Array_Desc; +#include "array_desc_device.h" +#include "matrix_device.h" +#ifdef ENABLE_NVHPC + + +class ComplexMatrixDevice{ +private: + int m=0; + int n=0; +public: + cuDoubleComplex* d_data=nullptr; + + cusolverMpGrid_t grid_cusolver=nullptr; + cusolverMpMatrixDescriptor_t 
desc_cusolver=nullptr;

    cublasMpGrid_t grid_cublas=nullptr;
    cublasMpMatrixDescriptor_t desc_cublas=nullptr;

    bool is_cusolver_init=false;
    bool is_cublas_init=false;

    /// Default: empty matrix, no device allocation.
    ComplexMatrixDevice(){
    }
    /// Allocate an uninitialized m x n device buffer (synchronous).
    /// Fix: the previous body started with `ComplexMatrixDevice();`, which is
    /// NOT constructor delegation in C++ — it constructs and immediately
    /// discards an unnamed temporary and has no effect on *this — so the
    /// misleading call was removed.
    ComplexMatrixDevice(const int& m, const int& n){
        this->m=m;
        this->n=n;
        CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex)));
    }
    /// Allocate an m x n device buffer and upload the host data c_data
    /// (both operations synchronous).
    ComplexMatrixDevice(const int& m, const int& n,void* c_data){
        this->m=m;
        this->n=n;
        CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex)));
        CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
    }
    /// (Re)allocate the device buffer only when the requested shape differs
    /// from the current one. A null stream means synchronous allocation.
    void set_data(const int& m, const int& n, const cudaStream_t& stream=nullptr){
        if(m!=this->m || n!=this->n){
            clean(stream);
            this->m=m;
            this->n=n;
            if(stream==nullptr){
                CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex)));
            }else{
                CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(cuDoubleComplex),stream));
            }
        }
    }
    /// (Re)allocate if needed, then upload an m x n host buffer c_data.
    void set_data(const int& m, const int& n,const void* c_data, const cudaStream_t& stream=nullptr){
        set_data(m,n,stream);
        if(stream==nullptr){
            CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
        }else{
            CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice,stream));
        }
    }
    /// (Re)allocate if needed, then copy an m x n buffer d_A that already
    /// resides on the device of gpu_dev_stream (device-to-device copy; source
    /// and destination use the same local device here).
    void set_data_device(const int& m, const int& n,const void* d_A, const GpuDeviceStream& gpu_dev_stream)
    {
        if(m!=this->m || n!=this->n){
            set_data(m,n,gpu_dev_stream.stream);
        }

        CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data,gpu_dev_stream.local_device, d_A, gpu_dev_stream.local_device, m * n * sizeof(cuDoubleComplex), gpu_dev_stream.stream));
    }
    /// Lazily create the cusolverMp process grid and matrix descriptor for
    /// this matrix from the BLACS-style descriptor arrdesc.
    void cusolver_init(cusolverMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){
        if(!is_cusolver_init){
CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid(handle,&grid_cusolver,cal_comm,arrdesc.nprows(),arrdesc.npcols(),CUSOLVERMP_GRID_MAPPING_ROW_MAJOR)); + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc(&desc_cusolver,grid_cusolver,CUDA_C_64F,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc())); + is_cusolver_init=true; + } + } + void cublas_init(cublasMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){ + if(!is_cublas_init){ + CUBLASMP_CHECK(cublasMpGridCreate(handle,arrdesc.nprows(),arrdesc.npcols(),CUBLASMP_GRID_LAYOUT_ROW_MAJOR,cal_comm,&grid_cublas)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorCreate(handle,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc(),CUDA_C_64F,grid_cublas,&desc_cublas)); + is_cublas_init=true; + } + } + cuDoubleComplex* ptr(){ + return d_data; + } + const cuDoubleComplex* ptr() const { + return d_data; + } + int nr(){ + return this->m; + } + int nc(){ + return this->n; + } + void set_nr(int m){ + this->m=m; + } + void set_nc(int n){ + this->n=n; + } + void set_ptr(cuDoubleComplex* d_data){ + if(this->d_data!=nullptr){ + cudaFree(this->d_data); + this->d_data=nullptr; + } + this->d_data=d_data; + } + void clean(const cudaStream_t& stream=nullptr){ + if(d_data!=nullptr){ + if(stream==nullptr) + cudaFree(d_data); + else + cudaFreeAsync(d_data,stream); + d_data=nullptr; + } + this->m=0; + this->n=0; + } + void set_as_zero(const cudaStream_t& stream=nullptr); + void set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device&); + void cublasClean(cublasMpHandle_t handle){ + if(desc_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,desc_cublas)); + desc_cublas=nullptr; + } + if(grid_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid_cublas)); + grid_cublas=nullptr; + } + is_cublas_init=false; + } + ~ComplexMatrixDevice(){ + if(is_cublas_init){ + fprintf(stderr, "Error: ComplexMatrixDevice not cleaned cublasMp resources before destructing!\n"); + std::abort(); + } + // 
CUDA_CHECK(cudaDeviceSynchronize()); + if(d_data!=nullptr){ + CUDA_CHECK(cudaFree(d_data)); + d_data=nullptr; + } + if(desc_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyMatrixDesc(desc_cusolver)); + desc_cusolver=nullptr; + } + if(grid_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyGrid(grid_cusolver)); + grid_cusolver=nullptr; + } + } + +}; + + +#endif + +class CudaConnector +{ +public: + static + void write_file(double* A,int M,int N,const char* name){ + std::fstream out; + out.open(name, std::ios::out); + if (!out.is_open()) { + std::cerr << "Failed to open file: " << name << std::endl; + return; + } + for(int i=0;i *a, cuDoubleComplex *b, const int n, const int lda,bool is_transpose=false) + { + if (is_transpose) + { + #pragma omp parallel for + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + b[i*lda+j].x = a[j*n+i].real(); + b[i*lda+j].y = a[j*n+i].imag(); + } + } + } + else + { + #pragma omp parallel for + for (int i = 0; i < n*lda; i++) + { + b[i].x = a[i].real(); + b[i].y = a[i].imag(); + } + } + } + static inline + void cuDoubleComplexToDoubleComplex_host(std::complex *a,const cuDoubleComplex *b,const int n,const int lda,bool is_transpose=false) + { + if (is_transpose) + { + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + a[j*n+i].real(b[i*lda+j].x); + a[j*n+i].imag(b[i*lda+j].y); + } + } + } + else + { + for (int i = 0; i < n*lda; i++) + { + a[i].real(b[i].x); + a[i].imag(b[i].y); + } + } + } + // update by chenhaobo in 2025-05-19 + static inline + void cuZgetrf_f(const int& m, const int& n, cuDoubleComplex *d_a, const int& lda, int *d_ipiv, int *d_info, cusolverDnHandle_t cusolverH) + { + cudaStream_t stream; + cusolverStatus_t status = cusolverDnGetStream(cusolverH, &stream); + // printf("cusolverDnGetStream status:%d\n",status); + // printf("Stream stream:%d\n",stream); + // printf("cusolverDnhandle:%d\n",cusolverH); + int lwork; + // auto start = std::chrono::high_resolution_clock::now(); 
+ cusolverDnZgetrf_bufferSize(cusolverH, m, n, d_a, lda, &lwork); + cuDoubleComplex *d_work; + cudaMallocAsync((void**)&d_work, lwork * sizeof(cuDoubleComplex), stream); + // cudaStreamSynchronize(stream); + cusolverDnZgetrf(cusolverH, m, n, d_a, lda, d_work, d_ipiv, d_info); + // cudaStreamSynchronize(stream); + // auto end = std::chrono::high_resolution_clock::now(); + // auto duration= std::chrono::duration_cast(end - start); + // int info; + // cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost); + // printf("stream:%d, info:%d,Time taken of the cudaStreamSynchronize on GPU: %lld microseconds\n",stream,info, duration.count()); + cudaFreeAsync(d_work, stream); + } + static + cuDoubleComplex det_cuDoubleComplex(const cuDoubleComplex *d_a,const int *d_ipiv, const int& n, const cudaStream_t stream); + static + void cuDoubleComplex_minus_identity_Async(cuDoubleComplex *d_a,const int& h_n, const cudaStream_t stream); + static inline void cuDoubleComplex_minus_identity_host(cuDoubleComplex* h_a, const int& n) + { + #pragma omp parallel for + for(int i=0;i *, const int &, + // const int &, const LIBRPA::Array_Desc &, int *, int &,const char order='C'); + static + void pzgetrf_nvhpc(const GpuDeviceStream&, ComplexMatrixDevice &, const int &, + const int &, const LIBRPA::Array_Desc &, int64_t *, int *,const char& order='C'); + static + void pgetrf_nvhpc_mixed_precision( + const GpuDeviceStream&, void *, + const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrf_nvhpc_mixed_precision( + void *, const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + const void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& 
JB, const LIBRPA::Array_Desc &, + int* d_info,const cudaDataType_t& compute_type, + const char& order='C' + ); + static void pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &, + const cudaDataType_t& compute_type, const char& order='C' + ); + static + std::complex det_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + std::complex trace_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + void trace_matrix_device_blacs(void* trace, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + static + void det_matrix_device_blacs(void* det, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + + // static + // void pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, + // const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, + // const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, + // const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C); + + static + void pgemm_device(cublasMpHandle_t,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t, + const ComplexMatrixDevice &,int64_t,int64_t, + const void *, + ComplexMatrixDevice &,int64_t,int64_t, + cublasComputeType_t); + + static + void pgemm_nvhpc(const GpuDeviceStream&,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + const ComplexMatrixDevice 
&,int64_t,int64_t,const Array_Desc&, + const void *, + ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + cublasComputeType_t); + // static + // void pgemm_nvhpc_cuFloatComplex(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + // const void *alpha, + // const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + // const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + // const void *beta, + // cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + // cublasComputeType_t cublas_compute_type); + static + void pgemm_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t&, + const char& order = 'c' + ); + static + void pgemr2d_nvhpc( + const GpuDeviceStream& 
gpu_dev_stream,const int&,const int&, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& + ); + static + void cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_a, cuDoubleComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void float_to_double_device(const float* d_a, double* d_b, const int64_t& len); + static + void double_to_float_device(const double* d_a, float* d_b, const int64_t& len); + static + void multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream); + static + void diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream); + static + void diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc,const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static + void transpose_ComplexMatrixDevice(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& in ,bool is_conjugate=false); + + #endif + +}; + +#endif // CUDA_CONNECTOR_H \ No newline at end of file diff --git a/src/device_connector.cpp b/src/device_connector.cpp new file mode 100644 index 00000000..b343450c --- /dev/null +++ b/src/device_connector.cpp @@ -0,0 +1,248 @@ +#include "device_connector.h" +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif +#include "device_stream.h" + +void DeviceConnector::pgetrf_device_mixed_precision( + void* d_A, const int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order +){ + { + #ifdef ENABLE_NVHPC + cudaDataType_t cuda_compute_type; + 
if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuda_compute_type = CUDA_C_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuda_compute_type = CUDA_C_32F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cuda_compute_type = CUDA_R_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cuda_compute_type = CUDA_R_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgetrf_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + d_A, m, n, + arrdesc_pi, + d_ipiv, d_info, + cuda_compute_type, + order + ); + #else + fprintf(stderr, "Error: NVHPC is not enabled in this build, cannot call pgetrf_device_mixed_precision\n"); + exit(1); + #endif + } +} + +void DeviceConnector::pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + cublasComputeType_t cublas_compute_type; + cublasOperation_t transA, transB; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transa=='N'||transa=='n'){ + transA = CUBLAS_OP_N; + }else if(transa=='T'||transa=='t'){ + transA = CUBLAS_OP_T; + }else if(transa=='C'||transa=='c'){ + transA = CUBLAS_OP_C; + 
}else{ + fprintf(stderr, "Error: Unsupported transa in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transb=='N'||transb=='n'){ + transB = CUBLAS_OP_N; + }else if(transb=='T'||transb=='t'){ + transB = CUBLAS_OP_T; + }else if(transb=='C'||transb=='c'){ + transB = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error: Unsupported transb in pgemm_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgemm_nvhpc_mixed_precision( + transA, transB, + m, n, k, + alpha, + d_A, ia, ja, array_descA, + d_B, ib, jb, array_descB, + beta, + d_C, ic, jc, array_descC, + cublas_compute_type + ); + #endif +} + +void DeviceConnector::transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate +){ + #ifdef ENABLE_NVHPC + cublasHandle_t cublasH = NULL; + CUBLAS_CHECK(cublasCreate(&cublasH)); + CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream)); + cublasOperation_t trans = CUBLAS_OP_T; + if(is_conjugate) + trans = CUBLAS_OP_C; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); + CUBLAS_CHECK(cublasZgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuDoubleComplex*)d_A, n, + &beta, + (cuDoubleComplex*)d_B, m, + (cuDoubleComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); + cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); + CUBLAS_CHECK(cublasCgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuFloatComplex*)d_A, n, + &beta, + (cuFloatComplex*)d_B, m, + (cuFloatComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + double alpha = 1.0; + double beta = 0.0; + CUBLAS_CHECK(cublasDgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (double*)d_A, n, + &beta, + (double*)d_B, m, + (double*)d_B, m)); + + }else 
if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){
        float alpha = 1.0;
        float beta = 0.0;
        CUBLAS_CHECK(cublasSgeam(
            cublasH, trans, CUBLAS_OP_N,
            m, n,
            &alpha,
            (float*)d_A, n,
            &beta,
            (float*)d_B, m,
            (float*)d_B, m));
    }else{
        fprintf(stderr, "Error: Unsupported compute type in transpose_device_blas\n");
        exit(1);
    }
    CUBLAS_CHECK(cublasDestroy(cublasH));
    #endif
}

/// Scale the first n entries of d_A in place by *num using cublas<t>scal on
/// the global device stream; num and d_A are reinterpreted per compute_type.
/// Fix: the cuBLAS handle created here was never destroyed, leaking one
/// handle (and its workspace) per call — sibling transpose_device_blas
/// already destroys its handle; this function now does the same.
void DeviceConnector::num_multiply_matrix_device(
    const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type
){
    #ifdef ENABLE_NVHPC
    cublasHandle_t cublasH = NULL;
    CUBLAS_CHECK(cublasCreate(&cublasH));
    CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream));
    if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){
        CUBLAS_CHECK(cublasZscal(
            cublasH, n,
            (cuDoubleComplex*)num,
            (cuDoubleComplex*)d_A, 1
        ));
    }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){
        CUBLAS_CHECK(cublasCscal(
            cublasH, n,
            (cuFloatComplex*)num,
            (cuFloatComplex*)d_A, 1
        ));
    }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){
        CUBLAS_CHECK(cublasDscal(
            cublasH, n,
            (double*)num,
            (double*)d_A, 1
        ));
    }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){
        CUBLAS_CHECK(cublasSscal(
            cublasH, n,
            (float*)num,
            (float*)d_A, 1
        ));
    }else{
        fprintf(stderr, "Error: Unsupported compute type in num_multiply_matrix_device\n");
        exit(1);
    }
    CUBLAS_CHECK(cublasDestroy(cublasH));
    #endif
}

/// Forward the diagonal-add to the CUDA backend (no-op without NVHPC).
void DeviceConnector::diag_add_matrix_device_blacs(
    const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type
){
    #ifdef ENABLE_NVHPC
    CudaConnector::diag_add_matrix_device_blacs(
        num, d_A, array_desc, compute_type
    );
    #endif
}

/// Forward the distributed-trace computation to the CUDA backend
/// (no-op without NVHPC).
void DeviceConnector::trace_matrix_device_blacs(
    void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type
){
    #ifdef ENABLE_NVHPC
    CudaConnector::trace_matrix_device_blacs(
        trace, d_A, array_desc, compute_type
    );
    #endif
}

void
DeviceConnector::det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::det_matrix_device_blacs( + det, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::float_to_double_device(float* d_A, double* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::float_to_double_device(d_A, d_B, n); + #endif +} + +void DeviceConnector::double_to_float_device(double* d_A, float* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::double_to_float_device(d_A, d_B, n); + #endif +} diff --git a/src/device_connector.h b/src/device_connector.h new file mode 100644 index 00000000..b4188f68 --- /dev/null +++ b/src/device_connector.h @@ -0,0 +1,54 @@ +#ifndef DEVICE_CONNECTOR_H +#define DEVICE_CONNECTOR_H +#include "base_blacs.h" +#include "helpers.h" +using LIBRPA::Array_Desc; + +class DeviceConnector{ +public: + static void pgetrf_device_mixed_precision( + void* d_A, const int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order='C' + ); + static void pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate=false + ); + static void num_multiply_matrix_device( + const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void 
diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + + static void trace_matrix_device_blacs( + void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void float_to_double_device(float* d_A, double* d_B, const int64_t& n); + static void double_to_float_device(double* d_A, float* d_B, const int64_t& n); + + +}; + + + +#endif // DEVICE_CONNECTOR_H \ No newline at end of file diff --git a/src/device_stream.cpp b/src/device_stream.cpp new file mode 100644 index 00000000..03b9ddfc --- /dev/null +++ b/src/device_stream.cpp @@ -0,0 +1,34 @@ +#include "device_stream.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +void DeviceStream::init(){ + local_device = DeviceStream::getLocalDevice(); + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = DeviceStream::allgather; + params.req_test = DeviceStream::request_test; + params.req_free = DeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = mpi_comm_global_h.myid; + params.nranks = mpi_comm_global_h.nprocs; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate((cudaStream_t*)&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolverMp_handle, local_device, (cudaStream_t)stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublasMp_handle, (cudaStream_t)stream)); + #endif +} +void DeviceStream::check_memory(){ + #ifdef ENABLE_NVHPC + size_t free, total; + CUDA_CHECK(cudaMemGetInfo(&free, &total)); // 直接查询驱动 + printf("rank:%d, Used: %f GiB / %f GiB\n",mpi_comm_global_h.myid, (total - free) / (1024.0*1024.0*1024.0), total / (1024.0*1024.0*1024.0)); + #endif 
+} + + +DeviceStream device_stream = DeviceStream(); \ No newline at end of file diff --git a/src/device_stream.h b/src/device_stream.h new file mode 100644 index 00000000..91b9800c --- /dev/null +++ b/src/device_stream.h @@ -0,0 +1,134 @@ +#ifndef DEVICE_STREAM_H +#define DEVICE_STREAM_H + +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#include +#include +#endif +#include "helpers.h" +#include +class DeviceStream{ +private: + #ifdef ENABLE_NVHPC + cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? 
CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + #endif + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + #endif + + return localRank % deviceCount; + } +public: + int local_device; + #ifdef ENABLE_NVHPC + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + cusolverMpHandle_t cusolverMp_handle = nullptr; + cublasMpHandle_t cublasMp_handle = nullptr; + #endif + DeviceStream(){} + void check_memory(); + void init(); + void finalize(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + ~DeviceStream(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + void sync() const{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + #endif + } + #ifdef ENABLE_NVHPC + void cudaSync() const{ + 
CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,(cudaStream_t)stream)); + } + #endif +}; + +extern DeviceStream device_stream; +#endif // DEVICE_STREAM_H \ No newline at end of file diff --git a/src/epsilon_cuda.cpp b/src/epsilon_cuda.cpp new file mode 100644 index 00000000..dc194711 --- /dev/null +++ b/src/epsilon_cuda.cpp @@ -0,0 +1,1448 @@ +#include "epsilon_cuda.h" +#include "epsilon.h" +#include "device_stream.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "atoms.h" +#include "constants.h" +#include "envs_blacs.h" +#include "envs_io.h" +#include "envs_mpi.h" +#include "lapack_connector.h" +#include "libri_utils.h" +#include "matrix_m_parallel_utils.h" +#include "parallel_mpi.h" +#include "params.h" +#include "pbc.h" +#include "profiler.h" +#include "scalapack_connector.h" +#include "stl_io_helper.h" +#include "utils_blacs.h" +#include "utils_io.h" +#include "utils_mem.h" +#include "utils_mpi_io.h" +#ifdef LIBRPA_USE_LIBRI +#include +#include +using RI::Tensor; +using RI::Communicate_Tensors_Map_Judge::comm_map2_first; +#endif +#ifdef ENABLE_NVHPC +#include "helpers.h" +#include +#include +#endif + +using LIBRPA::Array_Desc; +using LIBRPA::envs::blacs_ctxt_global_h; +using LIBRPA::envs::mpi_comm_global_h; +using LIBRPA::envs::ofs_myid; +using LIBRPA::utils::lib_printf; +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + lib_printf("Begin to compute_RPA_correlation_blacs_2d_nvhpc myid: %d\n", mpi_comm_global_h.myid); + system("free -m"); + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK_nvhpc 2D\n"); + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + if (mpi_comm_global_h.myid == 0) lib_printf("n_abf = %d\n", n_abf); + const auto part_range = 
LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + MatrixDevice> d_chi0_block, d_coul_block, d_coul_chi0_block; + #endif + vector> qpts; + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif +#ifdef LIBRPA_USE_LIBRI + + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, 
Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + } + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double block_begin = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>>> + chi0_libri; + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + 
std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); + } + } + if (mpi_comm_global_h.is_root()) + { + lib_printf("Begin to clean chi0 !!! \n"); + LIBRPA::utils::display_free_mem(); + lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", + chi0_wq.size(), freq, q.x, q.y, q.z); + } + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + + LIBRPA::utils::release_free_mem(); + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + } + + double pi_begin = omp_get_wtime(); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr(),device_stream.stream); + d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr(),device_stream.stream); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc(), coul_chi0_block.ptr(),device_stream.stream); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + bool is_mixed_precision = false; + if(is_mixed_precision) + { + MatrixDevice> d_coul_block_f(coul_block.nr(),coul_block.nc(),device_stream.stream); + MatrixDevice> d_chi0_block_f(chi0_block.nr(),chi0_block.nc(),device_stream.stream); + MatrixDevice> d_coul_chi0_block_f(coul_chi0_block.nr(),coul_chi0_block.nc(),device_stream.stream); + DeviceConnector::double_to_float_device((double*)d_coul_block.ptr(),(float*)d_coul_block_f.ptr(),d_coul_block.nr()*d_coul_block.nc()*2); + 
DeviceConnector::double_to_float_device((double*)d_chi0_block.ptr(),(float*)d_chi0_block_f.ptr(),d_chi0_block.nr()*d_chi0_block.nc()*2); + DeviceConnector::double_to_float_device((double*)d_coul_chi0_block.ptr(),(float*)d_coul_chi0_block_f.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + std::complex calpha_f(1.0f,0.0f),cbeta_f(0.0f,0.0f); + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha_f, + d_coul_block_f.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + &cbeta_f, + d_coul_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT + ); + DeviceConnector::float_to_double_device((float*)d_coul_chi0_block_f.ptr(),(double*)d_coul_chi0_block.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + }else{ + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + &cbeta, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + } + double pi_end = omp_get_wtime(); + complex trace_pi(0.0, 0.0); + complex trace_pi_loc(0.0, 0.0); + DeviceConnector::trace_matrix_device_blacs(&trace_pi_loc,d_coul_chi0_block.ptr(),desc_nabf_nabf,LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + DeviceConnector::num_multiply_matrix_device(d_coul_chi0_block.nr()*d_coul_chi0_block.nc(),&calpha1,d_coul_chi0_block.ptr(),LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + DeviceConnector::diag_add_matrix_device_blacs(&calpha,d_coul_chi0_block.ptr(),desc_nabf_nabf, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + int64_t *d_ipiv; + int *d_info ; + CUDA_CHECK(cudaMallocAsync((void**)&d_ipiv, sizeof(int64_t)*max(desc_nabf_nabf.m_loc(), desc_nabf_nabf.n_loc()),device_stream.stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_info, sizeof(int),device_stream.stream)); + + complex ln_det = + 
compute_pi_det_blacs_2d_nvhpc(d_coul_chi0_block, desc_nabf_nabf, d_ipiv, d_info, 'c'); + CUDA_CHECK(cudaFreeAsync(d_ipiv,device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_info,device_stream.stream)); + + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = trace_pi + ln_det; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // <, atom_mapping::pair_t_old>> + pi_freq_q_Mu_Nu; + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); + else + pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); + lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, + pi_freq_q_Mu_Nu.size()); + // mpi_comm_global_h.barrier(); + + int range_all = N_all_mu; + + vector part_range; + part_range.resize(atom_mu.size()); + part_range[0] = 0; + int count_range = 0; + + for (int I = 0; I != atom_mu.size() - 1; I++) + { + count_range += atom_mu[I]; + part_range[I + 1] = count_range; + } + + + // pi_freq_q contains all atoms + 
map, ComplexMatrix>> pi_freq_q; + + for(const auto &freq : chi0.tfg.get_freq_nodes()) + { + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; + if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi = freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; + pi_freq_q[freq][q].create(range_all, range_all); + + ComplexMatrix pi_munu_tmp(range_all, range_all); + pi_munu_tmp.zero_out(); + if(!chi0.get_chi0_q().empty()) + for (const auto &Mu_Nupi : MuNupi) + { + const auto Mu = Mu_Nupi.first; + const auto Nupi = Mu_Nupi.second; + const size_t n_mu = atom_mu[Mu]; + for (const auto &Nu_pi : Nupi) + { + const auto Nu = Nu_pi.first; + const auto pimat = Nu_pi.second; + const size_t n_nu = atom_mu[Nu]; + + for (size_t mu = 0; mu != n_mu; ++mu) + { + for (size_t nu = 0; nu != n_nu; ++nu) + { + pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); + } + } + } + } + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + { + mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); + } + else + { + pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); + } + } + } + // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + // if (mpi_comm_global_h.myid == 0) + { + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + int deviceCount; + cudaError_t err= cudaGetDeviceCount(&deviceCount); + + // if(err==cudaSuccess&&deviceCount>0&&deviceCount!=4) + // 
printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0); + // ==============================test the velocity of cuda stream======================================== + + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + range_all=TEST_LU_NUMBER; + complex* temp_complex=new complex[range_all*range_all]; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + double temp_begin_time=omp_get_wtime(); + // #pragma omp parallel for + for(int i=0;i[range_all*range_all]; + complex* c_ptr=pi_freq_q[freq][q].c; + + double temp_begin_time=omp_get_wtime(); + #pragma omp parallel for + for(int i=0;i trace_pi = trace(pi_freq_q.at(freq).at(q)); + complex rpa_for_omega_q(0.0, 0.0); + int info_LU = 1; + int *ipiv = new int[range_all]; + cuDoubleComplex det_test; + if(range_all det_for_rpa(det_test.x, det_test.y); + // printf("det_for_rpa_gpu: %f+%fi\n",det_test.x,det_test.y); + + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); + // printf("LU time by lapack: %lld us\n", duration.count()); + // complex trace_pi; + complex ln_det; + ln_det = std::log(det_for_rpa); + + // printf("in_det: %f+%fi, trace_pi: %f+%fi\n", ln_det.real(), ln_det.imag(), + // trace_pi.real(), trace_pi.imag()); + // cout << "PI trace vector:" << endl; + // cout << endl; + rpa_for_omega_q = ln_det + trace_pi; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n", + // tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration); + // printf("rpa_for_omega_q:%f+%fi,freq_weight:%f,irk_weight[q]:%f\n", + // rpa_for_omega_q.real(), rpa_for_omega_q.imag(), freq_weight, + // irk_weight[q]); + #ifdef OPEN_OMP_FOR_LU_DECOMPOSITION + #pragma omp critical + #endif + { + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + + 
tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + // printf("freq: %f, q: (%f, %f, %f), rpa_for_omega_q: %f+%fi,tot_RPA_energy_gpu: %f+%fi,num_iteration:%d,deviceId:%d,streamId:%d\n", + // freq, q.x, q.y, q.z, rpa_for_omega_q.real(), rpa_for_omega_q.imag(),tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration, deviceId, streamId); + + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n",tot_RPA_energy.real(),tot_RPA_energy.imag(),num_iteration); + } + // double test_endTime=omp_get_wtime(); + // printf("task_time:%f(aimed to test whether task is congested)\n",test_endTime-test_startTime); + num_iteration++; + #ifndef OPEN_OMP_FOR_LU_DECOMPOSITION + printf("one task time:%f\n",omp_get_wtime()-one_task_begin); + #endif + } + } + + } + printf("time for calculate: %f\n",omp_get_wtime()-time_calculate); + printf("mpi_comm_global_h.myid:%d,num_iteration:%d\n",mpi_comm_global_h.myid,num_iteration); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("time for all tasks: %f\n",omp_get_wtime()-start_time); + #endif + printf("tot_RPA_energy_gpu: %f+%fi\n",tot_RPA_energy.real(),tot_RPA_energy.imag()); + if(range_all, complex> global_cRPA_q; + for (auto q_weight : irk_weight) + { + MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, + MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); + } + + for (auto &q_crpa : global_cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + complex gather_tot_RPA_energy(0.0, 0.0); + MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, + mpi_comm_global_h.comm); + corr.value = gather_tot_RPA_energy; + } + // printf("gather_tot_RPA_energy_gpu: %f+%fi\n",corr.value.real(),corr.value.imag()); + corr.etype = CorrEnergy::type::RPA; + return corr; +} +#ifdef ENABLE_NVHPC + +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &d_A, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order) +{ + MatrixDevice> d_A_T; + + 
ORDER_CHECK(order); + if(order=='C'||order=='c'){ + d_A_T.set_data(d_A.nc(), d_A.nr(), device_stream.stream); + DeviceConnector::transpose_device_blas(d_A.ptr(),d_A_T.nr(), d_A_T.nc(),d_A_T.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + } + int ia=1,ja=1; + bool is_mixed_precision=false; + if(is_mixed_precision){ + MatrixDevice> d_A_f(d_A.nr(),d_A.nc(),device_stream.stream); + + DeviceConnector::double_to_float_device((order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),(float*)d_A_f.ptr(),d_A.nr()*d_A.nc()*2); + + DeviceConnector::pgetrf_device_mixed_precision( + d_A_f.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + order + ); + DeviceConnector::float_to_double_device((float*)d_A_f.ptr(),(order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),d_A.nr()*d_A.nc()*2); + }else{ + DeviceConnector::pgetrf_device_mixed_precision( + (order=='C'||order=='c')?(void*)d_A_T.ptr():(void*)d_A.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + order); + } + if(order=='C'||order=='c'){ + DeviceConnector::transpose_device_blas(d_A_T.ptr(),d_A.nr(),d_A.nc(), d_A.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + d_A_T.clean(device_stream.stream); + } + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + std::complex det_loc; + + DeviceConnector::det_matrix_device_blacs( + &det_loc, d_A.ptr(), arrdesc_pi, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + if(det_loc.real() > 0) + { + ln_det_loc = std::log(det_loc); + } + else + { + ln_det_loc = std::log(-det_loc); + } + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + return ln_det_all; +} +// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + 
const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using NVHPC" << endl; + } + mpi_comm_global_h.barrier(); + + Profiler::start("compute_Wc_freq_q_blacs_init"); + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // Use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter Maximal blocksize ensure that atom indices related to the rows/columns of a + // local matrix is minimized. + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + // This, however, is not optimal for matrix operations, and may lead to segment fault during + // MPI operations with parallel linear algebra subroutine. Thus we define an optimal blocksize + Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); + const int nb_opt = min(128, desc_nabf_nabf.nb()); + desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); + // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric + // matrix + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + // temp_block is used to collect data from IJ-pair data structure with comm_map2_first + auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + GpuDeviceStream gpu_dev_stream; + ComplexMatrixDevice d_temp_block; + ComplexMatrixDevice d_coul_block; + #endif + // Below are the working arrays for matrix operations + auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + 
ComplexMatrixDevice d_chi0_block; + ComplexMatrixDevice d_coul_chi0_block; + ComplexMatrixDevice d_coul_eigen_block; + #endif + auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_coulwc_block; + #endif + + const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + + coul_chi0_block.size() + coulwc_block.size()) * + 16.0e-6; + ofs_myid << get_timestamp() + << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks + << endl; + + const auto atpair_local = dispatch_upper_trangular_tasks( + natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, + blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); +#ifdef LIBRPA_DEBUG + ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; + ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; +#endif + + // IJ pair of Wc to be returned + pair, set> Iset_Jset_Wc; + for (const auto &ap : atpair_local) + { + Iset_Jset_Wc.first.insert(ap.first); + Iset_Jset_Wc.second.insert(ap.second); + } + + // Prepare local basis indices for 2D->IJ map + int I, iI; + map> map_lor_v; + map> map_loc_v; + for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_lor_v[I].push_back(iI); + } + for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_loc_v[I].push_back(iI); + } + + vector> qpts; + for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); + + vec eigenvalues(n_abf); + Profiler::cease("compute_Wc_freq_q_blacs_init"); + LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), + 
Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); + + Profiler::start("compute_Wc_freq_q_work"); +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); + const int iq_in_k = + std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); + // q-point in fractional coordinates + const auto &qf = kfrac_list[iq_in_k]; + LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, + qpts.size(), qf.x, qf.y, qf.z); + coul_block.zero_out(); + coulwc_block.zero_out(); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + // q-array for LibRI object + std::array qa = {q.x, q.y, q.z}; + + // collect the block elements of truncated coulomb matrices first + // as we reuse coul_eigen_block to reduce memory usage + Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); + { + size_t n_singular_coulwc; + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || + coulmat_wc.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + 
mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); + + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); + Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); + power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); + ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; + + Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); + // collect the block elements of coulomb matrices + { + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || + coulmat_eps.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + 
std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; + // ofs_myid << "Couleps_libri" << endl << couleps_libri; + // if (couleps_libri.size() == 0) + // throw std::logic_error("data at q-point not found in coulmat_eps"); + + // perform communication + ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; +#ifdef LIBRPA_DEBUG + ofs_myid << set_IJ_nabf_nabf << endl; + ofs_myid << "Extended blocks" << endl; + ofs_myid << "atom 1: " << s0_s1.first << endl; + ofs_myid << "atom 2: " << s0_s1.second << endl; +#endif + // ofs_myid << "Owned blocks\n"; + // print_keys(ofs_myid, couleps_libri); + // mpi_comm_global_h.barrier(); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; + + ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_coul_block.set_data(coul_block.nr(),coul_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + gpu_dev_stream.cudaSync(); + CUDA_CHECK(cudaMemcpy(coul_block.ptr(), d_coul_block.ptr(), sizeof(cuDoubleComplex)*coul_block.nr()*coul_block.nc(), cudaMemcpyDeviceToHost)); + #endif + ofs_myid << get_timestamp() << " 
Done construct couleps 2D block" << endl; + } + + size_t n_singular; + ofs_myid << get_timestamp() << " Start power hemat couleps\n"; + matrix_m> sqrtveig_blacs; + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_sqrtveig_blacs; + #endif + if (is_gamma_point(q)) + { + // choice of power_hemat_blacs_real/power_hemat_blacs_desc + // leads to sub-meV difference + sqrtveig_blacs = power_hemat_blacs_real( + coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, + eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); + if (Params::replace_w_head && Params::option_dielect_func == 3) + { + df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); + } + } + else + { + sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + } + ofs_myid << get_timestamp() << " Done power hemat couleps\n"; + // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); + // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at + // imaginary frequency is not prepared + if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); + const size_t n_nonsingular = n_abf - n_singular; + if(gpu_dev_stream.rank==0){ + printf("n_abf:%lu,n_nonsingular:%lu,n_singular:%lu\n",n_abf,n_nonsingular,n_singular); + } + Profiler::stop("epsilon_prepare_couleps_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); + ofs_myid << get_timestamp() << " Done couleps sqrt\n"; + std::flush(ofs_myid); + + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + Profiler::start("epsilon_wc_work_q_omega"); + Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); + 
chi0_block.zero_out(); + { + std::map>, + RI::Tensor>>> + chi0_libri; + if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) + { + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = + RI::Tensor>({n_mu, n_nu}, pchi); + } + } + // Release the chi0 block for this frequency and q to reduce memory load, + // as they will not be used again + chi0.free_chi0_q(freq, q); + } + // ofs_myid << "chi0_libri" << endl << chi0_libri; + Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); + const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); + Profiler::start("epsilon_prepare_chi0_2d_collect_block"); + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_chi0_block.set_data(chi0_block.nr(),chi0_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + #endif + Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); + } + Profiler::stop("epsilon_prepare_chi0_2d"); + + 
Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); + if(gpu_dev_stream.rank==0) + printf("is_gamma_point(q):%d\n",is_gamma_point(q)); + const std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) + { + ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; + // rotate to Coulomb-eigenvector basis + // descending order + + d_sqrtveig_blacs.set_data(sqrtveig_blacs.nr(), sqrtveig_blacs.nc(), sqrtveig_blacs.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + gpu_dev_stream.cudaSync(); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_abf, + &calpha, + d_chi0_block,1,1,desc_nabf_nabf_opt, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_C,CUBLAS_OP_N,n_nonsingular, n_nonsingular, n_abf, + &calpha, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + if (Params::option_dielect_func == 3) + { + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + // { + // std::string filename = "gpu_"; + // filename += to_string(gpu_dev_stream.nranks); + // filename += "_"; + // filename += to_string(gpu_dev_stream.rank); + // gpu_dev_stream.cudaSync(); + // CUDA_CHECK(cudaMemcpy(coul_chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*coul_chi0_block.nr()*coul_chi0_block.nc(),cudaMemcpyDeviceToHost)); + // 
CudaConnector::write_file((cuDoubleComplex*)coul_chi0_block.ptr(),coul_chi0_block.nr(),coul_chi0_block.nc(),filename.data()); + // } + // CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + + ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" + << endl; + // df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); + df_headwing.rewrite_eps_nvhpc(gpu_dev_stream, d_chi0_block, ifreq, desc_nabf_nabf_opt); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + // rotate back to ABF + // descending order + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // subtract 1 from diagonal + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,-1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( 
+ gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_multiply_coulwc"); + } + else + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); + if (ilo >= 0 && jlo >= 0) + { + ofs_myid << get_timestamp() << "Perform the head element overwrite" << endl; + std::complex temp_element = 1.0 - epsmac_LF_imagfreq[ifreq]; + CUDA_CHECK(cudaMemcpy(d_chi0_block.ptr() + ilo + jlo * d_chi0_block.nr(), + &temp_element, sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice)); + } + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + 
d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + 
d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + } + else + { + Profiler::start("epsilon_compute_eps_pgemm_1"); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::cease("epsilon_compute_eps_pgemm_1"); + Profiler::start("epsilon_compute_eps_pgemm_2"); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // d_coul_block.cublasClean(gpu_dev_stream.cublas_handle); + Profiler::cease("epsilon_compute_eps_pgemm_2"); + // now chi0_block is actually v1/2 chi v1/2 + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::stop("epsilon_compute_eps"); + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), 
d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + 
Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + // Array_Desc_Device array_desc_device(desc_nabf_nabf_opt); + // printf("successful create object array_desc_device\n"); + #ifndef ENABLE_NVHPC + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + CUDA_C_64F + ); + CUDA_CHECK(cudaMemcpyAsync(temp_block.ptr(), d_temp_block.ptr(), sizeof(cuDoubleComplex)*temp_block.nr()*temp_block.nc(), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + #endif + + Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); + Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); + map>>> Wc_MNmap; + // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, + // LIBRPA::atomic_basis_abf, chi0_block, + // desc_nabf_nabf, MAJOR::ROW); + map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, + temp_block, desc_nabf_nabf, MAJOR::ROW); + Profiler::stop("epsilon_convert_wc_map_block"); + + Profiler::start("epsilon_convert_wc_communicate", "Communicate"); + { + std::map>, + RI::Tensor>>> + Wc_libri; + Profiler::start("epsilon_convert_wc_communicate_1"); + for (const auto &M_NWc : Wc_MNmap) + { + const auto &M = M_NWc.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_Wc : M_NWc.second) + { + const auto &N = N_Wc.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &Wc = N_Wc.second; + // std::valarray> 
Wc_va(Wc.ptr(), Wc.size()); + // auto pWc = std::make_shared>>(); + // *pWc = Wc_va; + /*if (iq == 10 && ifreq == 10) + { + char fn[100]; + sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); + print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); + }*/ + Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); + } + } + Profiler::stop("epsilon_convert_wc_communicate_1"); + Profiler::start("epsilon_convert_wc_communicate_2"); + // main timing + // cout << Wc_libri; + const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); + Profiler::stop("epsilon_convert_wc_communicate_2"); + Profiler::start("epsilon_convert_wc_communicate_3"); + // parse collected to + for (const auto &MN : atpair_local) + { + const auto &M = MN.first; + const auto &N = MN.second; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + // Use row major for later usage in LibRI + Wc_freq_q[freq][M][N][q] = matrix_m>( + n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); + } + Profiler::stop("epsilon_convert_wc_communicate_3"); + // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) + // for ( int i_nu = 0; i_nu != n_nu; i_nu++ ) + // { + // } + } + Profiler::stop("epsilon_convert_wc_communicate"); + Profiler::stop("epsilon_convert_wc_2d_to_ij"); + Profiler::cease("epsilon_wc_work_q_omega"); + LIBRPA::utils::lib_printf_root( + "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, + Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), + Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + Profiler::cease("compute_Wc_freq_q_work"); + LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_work"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); 
+ + return Wc_freq_q; +} + +#endif \ No newline at end of file diff --git a/src/epsilon_cuda.h b/src/epsilon_cuda.h new file mode 100644 index 00000000..0fd7b2d2 --- /dev/null +++ b/src/epsilon_cuda.h @@ -0,0 +1,18 @@ +#pragma once +#include "epsilon.h" +#include "device_connector.h" +#include "matrix_device.h" + +#ifdef LIBRPA_USE_CUDA +CorrEnergy compute_RPA_correlation_cuda(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat); +#endif +#ifdef ENABLE_NVHPC +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat); + +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &, const atpair_k_cplx_mat_t &, + atpair_k_cplx_mat_t &, + const vector> &); +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order='C'); +#endif diff --git a/src/matrix_device.cpp b/src/matrix_device.cpp new file mode 100644 index 00000000..28b7987e --- /dev/null +++ b/src/matrix_device.cpp @@ -0,0 +1,16 @@ +#include "matrix_device.h" +#include "device_stream.h" + +template +void MatrixDevice::set_data_device(const int& m, const int& n, const void* d_A, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeer(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } +} \ No newline at end of file diff --git a/src/matrix_device.h b/src/matrix_device.h new file mode 100644 index 00000000..000c3847 --- /dev/null +++ b/src/matrix_device.h @@ -0,0 +1,111 @@ +#ifndef MATRIX_DEVICE_H +#define MATRIX_DEVICE_H + +#include +#ifdef ENABLE_NVHPC +#include +#include "helpers.h" +#endif + +template +class MatrixDevice{ +private: + T* d_data=nullptr; + int m=0; + int n=0; +public: + 
MatrixDevice(){} + MatrixDevice(const int& m, const int& n, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + MatrixDevice(const int& m, const int& n,const void* c_data, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data(const int& m, const int& n, const void* stream){ + if(m!=this->m || n!=this->n){ + clean(stream); + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + } + void set_data(const int& m, const int& n,const void* c_data, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data_device(const int& m, const int& n, const void* d_A, const void* stream); + + + void clean(const void* stream){ + if(d_data!=nullptr){ + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + cudaFree(d_data); + #endif + }else{ + #ifdef ENABLE_NVHPC + 
cudaFreeAsync(d_data, (cudaStream_t)stream); + #endif + } + d_data=nullptr; + } + this->m=0; + this->n=0; + } + T* ptr(){ + return d_data; + } + const T* ptr() const { + return d_data; + } + int nr() const { + return m; + } + int nc() const { + return n; + } + ~MatrixDevice(){ + clean(nullptr); + } + +}; + +#endif // MATRIX_DEVICE_H \ No newline at end of file From 154ab9a882e053ed45f17276579e636c350b65af Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:55:53 +0800 Subject: [PATCH 09/18] Add files via upload --- src/helpers.h | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 src/helpers.h diff --git a/src/helpers.h b/src/helpers.h new file mode 100644 index 00000000..ec014586 --- /dev/null +++ b/src/helpers.h @@ -0,0 +1,107 @@ + +#ifndef HELPERS_H +#define HELPERS_H + +#pragma once +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#endif + + +typedef enum{ + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + LIBRPA_COMPUTE_TYPE_DOUBLE, + LIBRPA_COMPUTE_TYPE_FLOAT +}LIBRPA_DEVICE_COMPUTE_TYPE; + + +#ifdef ENABLE_NVHPC +#define NVHPC_MPI_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != MPI_SUCCESS) \ + { \ + fprintf(stderr, "MPI error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define NVSHMEM_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != 0) \ + { \ + fprintf(stderr, "NVSHMEM error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t status = call; \ + if (status != cudaSuccess) \ + { \ + fprintf(stderr, "CUDA error at %s:%d : %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLASMP_CHECK(call) \ + do \ + { \ + cublasStatus_t status = call; \ + if (status 
!= CUBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuBLASMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define CAL_CHECK(call) \ + do \ + { \ + calError_t status = call; \ + if (status != CAL_OK) \ + { \ + fprintf(stderr, "CAL error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + }while(0) + +#define CUSOLVERMP_CHECK(call) \ + do \ + { \ + cusolverStatus_t status = call; \ + if (status != CUSOLVER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuSOLVERMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif +#define ORDER_CHECK(order) \ + do \ + { \ + if(order !='C'&&order !='c'&&order !='R'&&order !='r') \ + { \ + fprintf(stderr, "Order should be either 'C' or 'R', order error at %s:%d:%s\n", __FILE__, __LINE__, order);\ + exit(EXIT_FAILURE); \ + } \ + }while (0) + +#endif // HELPERS_H \ No newline at end of file From 91026d70fd5e955d4b810feb2a1bb2f156b5478a Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:56:33 +0800 Subject: [PATCH 10/18] Add files via upload --- driver/main.cpp | 12 +++-- driver/task_gw_band.cpp | 115 ++++++++++++---------------------------- 2 files changed, 43 insertions(+), 84 deletions(-) diff --git a/driver/main.cpp b/driver/main.cpp index 736a1838..164671ed 100644 --- a/driver/main.cpp +++ b/driver/main.cpp @@ -39,7 +39,9 @@ #include "utils_cmake.h" #include "utils_mem.h" #include "utils_mpi_io.h" - +#ifdef ENABLE_NVHPC +#include "device_stream.h" +#endif static void initialize(int argc, char **argv) { using namespace LIBRPA::envs; @@ -54,7 +56,9 @@ static void initialize(int argc, char **argv) } 
initialize_librpa_environment(MPI_COMM_WORLD, 0, 0, ""); - + #ifdef ENABLE_NVHPC + device_stream.init(); + #endif // Global profiler begins right after MPI is initialized Profiler::start("total", "Total"); } @@ -77,7 +81,9 @@ static void finalize(bool success) lib_printf("libRPA failed\n"); } } - + #ifdef ENABLE_NVHPC + device_stream.finalize(); + #endif finalize_librpa_environment(); MPI_Finalize(); diff --git a/driver/task_gw_band.cpp b/driver/task_gw_band.cpp index fd420614..1a9d99b3 100644 --- a/driver/task_gw_band.cpp +++ b/driver/task_gw_band.cpp @@ -23,6 +23,10 @@ #include "ri.h" #include "utils_timefreq.h" #include "write_aims.h" +#ifdef ENABLE_NVHPC +#include "epsilon_cuda.h" +#include +#endif void task_g0w0_band(std::map, ComplexMatrix> &sinvS) { @@ -114,19 +118,9 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Profiler::start("g0w0_exx", "Build exchange self-energy"); auto exx = LIBRPA::Exx(meanfield, kfrac_list, period); { - atpair_R_mat_t VR; - if (Params::use_fullcoul_exx) - { - Profiler::start("ft_vq_full", "Fourier transform full Coulomb"); - VR = FT_Vq(Vq, meanfield.get_n_kpoints(), Rlist, true); - Profiler::stop("ft_vq_full"); - } - else - { - Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); - VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); - Profiler::stop("ft_vq_cut"); - } + Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); + const auto VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); + Profiler::stop("ft_vq_cut"); Profiler::start("g0w0_exx_real_work"); if (Params::use_shrink_abfs) @@ -156,7 +150,15 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Wc_freq_q; if (Params::use_scalapack_gw_wc) { + #ifdef ENABLE_NVHPC + int numDevices; + cudaError_t cudaStat = cudaGetDeviceCount(&numDevices); + if(cudaStat == cudaSuccess && numDevices>0) + Wc_freq_q = compute_Wc_freq_q_blacs_cuda(chi0, Vq, Vq_cut, epsmac_LF_imagfreq); + else + #endif Wc_freq_q = 
compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, epsmac_LF_imagfreq); + } else { @@ -442,21 +444,11 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) } } // output bandgap - double gw_bandgap = 0.0; - double gw_valence = -1.e10; - double gw_conduct = 1.e10; - double exx_bandgap = 0.0; - double exx_valence = -1.e10; - double exx_conduct = 1.e10; - double dft_bandgap = 0.0; - double dft_valence = -1.e10; - double dft_conduct = 1.e10; - int ik_val_gw = 0; - int ik_cond_gw = 0; - int ik_val_exx = 0; - int ik_cond_exx = 0; - int ik_val_dft = 0; - int ik_cond_dft = 0; + double bandgap = 0.0; + double valence = -1.e10; + double conduct = 1.e10; + int ik_val = 0; + int ik_cond = 0; int nocc = 0; auto &wg = meanfield.get_weight()[0]; for (int i = 0; i != wg.size; i++) @@ -522,38 +514,16 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) ofs_hf << std::setw(15) << std::setprecision(5) << occ_state << std::setw(15) << std::setprecision(5) << eks_state - vxc_state + exx_state; - // output GW bandgap - if (i_state == nocc - 1 && eqp > gw_conduct) // HOMO - { - gw_valence = eqp; - ik_val_gw = i_kpoint; - } - else if (i_state == nocc && eqp < gw_conduct) // LUMO - { - gw_conduct = eqp; - ik_cond_gw = i_kpoint; - } - // output EXX bandgap - if (i_state == nocc - 1 && exx_state > exx_valence) // HOMO - { - exx_valence = exx_state; - ik_val_exx = i_kpoint; - } - else if (i_state == nocc && exx_state < exx_conduct) // LUMO - { - exx_conduct = exx_state; - ik_cond_exx = i_kpoint; - } - // output DFT bandgap - if (i_state == nocc - 1 && eks_state > dft_valence) // HOMO + // output bandgap + if (i_state == nocc - 1 && eqp > valence) // HOMO { - dft_valence = eks_state; - ik_val_dft = i_kpoint; + valence = eqp; + ik_val = i_kpoint; } - else if (i_state == nocc && eks_state < dft_conduct) // LUMO + else if (i_state == nocc && eqp < conduct) // LUMO { - dft_conduct = eks_state; - ik_cond_dft = i_kpoint; + conduct = eqp; + ik_cond = i_kpoint; } } ofs_gw << "\n"; @@ -561,30 +531,13 @@ 
void task_g0w0_band(std::map, ComplexMatrix> &sinvS) ofs_ks << "\n"; } } - gw_bandgap = gw_conduct - gw_valence; - exx_bandgap = exx_conduct - exx_valence; - dft_bandgap = dft_conduct - dft_valence; - const auto &k_val_gw = kfrac_band[ik_val_gw]; - const auto &k_cond_gw = kfrac_band[ik_cond_gw]; - printf("GW VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_gw + 1, k_val_gw.x, k_val_gw.y, - k_val_gw.z); - printf("GW CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_gw + 1, k_cond_gw.x, - k_cond_gw.y, k_cond_gw.z); - lib_printf("GW bandgap(eV): %12.7f \n", gw_bandgap); - const auto &k_val_exx = kfrac_band[ik_val_exx]; - const auto &k_cond_exx = kfrac_band[ik_cond_exx]; - printf("EXX VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_exx + 1, k_val_exx.x, - k_val_exx.y, k_val_exx.z); - printf("EXX CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_exx + 1, k_cond_exx.x, - k_cond_exx.y, k_cond_exx.z); - lib_printf("EXX bandgap(eV): %12.7f \n", exx_bandgap); - const auto &k_val_dft = kfrac_band[ik_val_dft]; - const auto &k_cond_dft = kfrac_band[ik_cond_dft]; - printf("DFT VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_dft + 1, k_val_dft.x, - k_val_dft.y, k_val_dft.z); - printf("DFT CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_dft + 1, k_cond_dft.x, - k_cond_dft.y, k_cond_dft.z); - lib_printf("DFT bandgap(eV): %12.7f \n", dft_bandgap); + bandgap = conduct - valence; + const auto &k_val = kfrac_band[ik_val]; + printf("VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val + 1, k_val.x, k_val.y, k_val.z); + const auto &k_cond = kfrac_band[ik_cond]; + printf("CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond + 1, k_cond.x, k_cond.y, + k_cond.z); + lib_printf("Bandgap(eV): %12.7f \n", bandgap); } Profiler::stop("g0w0_solve_band_qpe"); From c91e40b956c9a98490931739a59fe289e44c133c Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:00:45 +0800 Subject: [PATCH 11/18] Add files via upload --- src/dielecmodel.cpp | 355 
+++++++++++++++++++++++++++++++++++++++++--- src/dielecmodel.h | 17 ++- 2 files changed, 352 insertions(+), 20 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index 87507dcd..cff7158f 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -19,7 +19,9 @@ using RI::Communicate_Tensors_Map_Judge::comm_map2_first; using LIBRPA::envs::mpi_comm_global_h; using LIBRPA::envs::ofs_myid; using LIBRPA::utils::lib_printf; - +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif const int DoubleHavriliakNegami::d_npar = 8; const std::function &)> @@ -1070,6 +1072,43 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, return desc_body; }; +Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) +{ + Profiler::start("get_inverse_body_of_chi0_nvhpc"); + gpu_dev_stream.calSync(); + Array_Desc desc_body(blacs_ctxt_global_h); + desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + this->d_body_inv.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + Array_Desc_Device desc_body_dev(desc_body); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_nonsingular - 1, n_nonsingular - 1, + d_chi0_block.ptr(), 2, 2, desc_nabf_nabf_opt, + this->d_body_inv.ptr(), 1, 1, desc_body, + CUDA_C_64F + ); + ComplexMatrixDevice d_identity; + d_identity.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + d_identity.set_as_identity(gpu_dev_stream, desc_body_dev); + char order = 'c'; + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_body_inv); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_body_inv.ptr(), 1, 1, desc_body, + d_identity.ptr(), 1, 1, desc_body, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + 
CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + + this->d_body_inv.set_data_device(d_identity.nr(),d_identity.nc(),d_identity.ptr(),gpu_dev_stream); + Profiler::stop("get_inverse_body_of_chi0_nvhpc"); + return desc_body; +} + void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) { Profiler::start("cal_L"); @@ -1136,6 +1175,170 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) Profiler::stop("cal_L"); }; +#ifdef ENABLE_NVHPC +void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body) +{ + Profiler::start("cal_L"); + this->Lind.resize(3, 3, MAJOR::COL); + this->bw.resize(n_nonsingular - 1, 3, MAJOR::COL); + this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); + Array_Desc desc_wing(blacs_ctxt_global_h); + desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); + + Array_Desc desc_wing_opt(blacs_ctxt_global_h); + desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + + + Array_Desc desc_lam_3(blacs_ctxt_global_h); + desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); + + Array_Desc desc_lam_3_opt(blacs_ctxt_global_h); + desc_lam_3_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_lam_3.nb(), 0, 0); + + Array_Desc desc_3_lam(blacs_ctxt_global_h); + desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); + + Array_Desc desc_3_lam_opt(blacs_ctxt_global_h); + desc_3_lam_opt.init(3, n_nonsingular - 1, desc_3_lam.mb(), desc_body.nb(), 0, 0); + + Array_Desc desc_3_3(blacs_ctxt_global_h); + desc_3_3.init_square_blk(3, 3, 0, 0); + + auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); + auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); + auto Lind_loc = init_local_mat>(desc_3_3, MAJOR::COL); + // printf("rank:%d,ma:%d,na:%d,mb:%d,nb:%d,mc:%d,nc:%d\n",gpu_dev_stream.rank,desc_body.m(),desc_body.n(),desc_wing_opt.m(),desc_wing_opt.n(),desc_lam_3_opt.m(),desc_lam_3_opt.n()); + // 
printf("rank:%d,m_loc_a:%d,n_loc_a:%d,m_loc_b:%d,n_loc_b:%d,m_loc_c:%d,n_loc_c:%d\n",gpu_dev_stream.rank,desc_body.m_loc(),desc_body.n_loc(),desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc()); + // printf("rank:%d,mba:%d,nba:%d,mbb:%d,nbb:%d,mbc:%d,nbc:%d\n",gpu_dev_stream.rank,desc_body.mb(),desc_body.nb(),desc_wing_opt.mb(),desc_wing_opt.nb(),desc_lam_3_opt.mb(),desc_lam_3_opt.nb()); + // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); + #ifdef ENABLE_NVHPC + auto wing_ifreq_opt = init_local_mat>(desc_wing_opt, MAJOR::COL); + auto lam_3_opt = init_local_mat>(desc_lam_3_opt, MAJOR::COL); + auto _3_lam_opt = init_local_mat>(desc_3_lam_opt, MAJOR::COL); + ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc; + ComplexMatrixDevice d_lam_3_opt,d_3_lam_opt; + ComplexMatrixDevice d_wing_ifreq,d_wing_ifreq_opt; + d_wing_ifreq.set_data(wing.at(ifreq).nr(),wing.at(ifreq).nc(),wing.at(ifreq).ptr(),gpu_dev_stream.stream); + d_wing_ifreq_opt.set_data(desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),gpu_dev_stream.stream); + // d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); + d_lam_3_opt.set_data(desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc(),gpu_dev_stream.stream); + // d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); + d_3_lam_opt.set_data(desc_3_lam_opt.m_loc(),desc_3_lam_opt.n_loc(),gpu_dev_stream.stream); + d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); + ScalapackConnector::pgemr2d_f( + n_nonsingular-1, 3, + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, + wing_ifreq_opt.ptr(), 1, 1, desc_wing_opt.desc, + blacs_ctxt_global_h.ictxt + ); + // CudaConnector::pgemr2d_nvhpc( + // gpu_dev_stream, n_nonsingular-1, 3, + // d_wing_ifreq.ptr(), 1, 1, desc_wing, + // d_wing_ifreq_opt.ptr(), 1, 1, desc_wing, + // CUDA_C_64F + // ); + 
CUDA_CHECK(cudaMemcpyAsync(d_wing_ifreq_opt.ptr(),wing_ifreq_opt.ptr(),desc_wing_opt.m_loc()*desc_wing_opt.n_loc()*sizeof(std::complex),cudaMemcpyHostToDevice,gpu_dev_stream.stream)); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, + &calpha, + d_body_inv, 1, 1, desc_body, + d_wing_ifreq_opt, 1, 1, desc_wing_opt, + &cbeta, + d_lam_3_opt, 1, 1, desc_lam_3_opt, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + CUDA_CHECK(cudaMemcpyAsync(lam_3_opt.ptr(),d_lam_3_opt.ptr(),desc_lam_3_opt.m_loc()*desc_lam_3_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + ScalapackConnector::pgemr2d_f( + n_nonsingular-1, 3, + lam_3_opt.ptr(), 1, 1, desc_lam_3_opt.desc, + lam_3.ptr(), 1, 1, desc_lam_3.desc, + blacs_ctxt_global_h.ictxt + ); + gpu_dev_stream.cudaSync(); + #else + ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, + desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); + #endif + #ifdef ENABLE_NVHPC + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, 3, n_nonsingular - 1, + &calpha, + d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_lam_3_opt, 1, 1, desc_lam_3_opt, + &cbeta, + d_Lind_loc, 1, 1, desc_3_3, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(),d_Lind_loc.ptr(),desc_3_3.m_loc()*desc_3_3.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + #else + ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, + desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, + Lind_loc.ptr(), 1, 1, desc_3_3.desc); + #endif + #ifndef ENABLE_NVHPC + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, + &calpha, + 
d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_body_inv, 1, 1, desc_body, + &cbeta, + d_3_lam_opt, 1, 1, desc_3_lam_opt, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + CUDA_CHECK(cudaMemcpyAsync(_3_lam_opt.ptr(),d_3_lam_opt.ptr(),desc_3_lam_opt.m_loc()*desc_3_lam_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + ScalapackConnector::pgemr2d_f( + 3, n_nonsingular-1, + _3_lam_opt.ptr(), 1, 1, desc_3_lam_opt.desc, + _3_lam.ptr(), 1, 1, desc_3_lam.desc, + blacs_ctxt_global_h.ictxt + ); + #else + ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, + desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); + #endif + + for (int i = 0; i != 3; i++) + { + auto loc_i = desc_3_3.indx_g2l_r(i); + for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) + { + auto loc_ilambda = desc_lam_3.indx_g2l_r(ilambda); + auto loc_ibw = desc_lam_3.indx_g2l_c(i); + if (loc_ibw >= 0 && loc_ilambda >= 0) + this->bw(ilambda, i) = lam_3(loc_ilambda, loc_ibw); + + loc_ilambda = desc_3_lam.indx_g2l_c(ilambda); + auto loc_iwb = desc_3_lam.indx_g2l_r(i); + if (loc_iwb >= 0 && loc_ilambda >= 0) + this->wb(i, ilambda) = _3_lam(loc_iwb, loc_ilambda); + + MPI_Allreduce(MPI_IN_PLACE, &bw(ilambda, i), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + MPI_Allreduce(MPI_IN_PLACE, &wb(i, ilambda), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + + for (int j = 0; j != 3; j++) + { + auto loc_j = desc_3_3.indx_g2l_c(j); + if (loc_j >= 0 && loc_i >= 0) + this->Lind(i, j) = head.at(ifreq)(i, j) - Lind_loc(loc_i, loc_j); + MPI_Allreduce(MPI_IN_PLACE, &Lind(i, j), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + } + + Profiler::stop("cal_L"); +}; +#endif + void diele_func::get_Leb_points() { auto quad_order = lebedev::QuadratureOrder::order_5810; @@ -1296,24 +1499,6 @@ void diele_func::cal_eps(const int ifreq, 
Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - // auto identity = init_local_mat>(desc_body, MAJOR::COL); - // for (int i = 0; i < n_nonsingular - 1; i++) - // { - // const int ilo = desc_body.indx_g2l_r(i); - // if (ilo < 0) continue; - // for (int j = 0; j < n_nonsingular - 1; j++) - // { - // const int jlo = desc_body.indx_g2l_c(j); - // if (jlo < 0) continue; - // if (i == j) - // identity(ilo, jlo) = 1.0; - // else - // identity(ilo, jlo) = 0.0; - // } - // } - // ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - 1, - // 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, - // desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); ScalapackConnector::pgeadd_f( 'N', n_nonsingular - 1, n_nonsingular - 1, 1.0, @@ -1327,6 +1512,119 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ Profiler::stop("cal_inverse_dielectric_matrix"); }; +#ifdef ENABLE_NVHPC +void diele_func::cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body) +{ + Profiler::start("cal_inverse_dielectric_matrix"); + // mpi_comm_global_h.barrier(); + this->chi0 = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + + const double k_volume = std::abs(G.Det()); + this->vol_gamma = k_volume / nk; + double vol_gamma_numeric = 0.0; + if (ifreq == 0 && mpi_comm_global_h.is_root()) + { + for (int ileb = 0; ileb != qw_leb.size(); ileb++) + { + vol_gamma_numeric += qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / 3.0; + } + std::cout << "Number of angular grids for average inverse dielectric matrix: " + << qw_leb.size() << std::endl; + std::cout << "vol_gamma_numeric/vol_gamma: " << vol_gamma_numeric << ", " << vol_gamma + << std::endl; + std::cout << "Angular quadrature accuracy for volume: " << vol_gamma_numeric / vol_gamma + << " (should be close to 1)" << std::endl; + } + /*std::cout << "major of Matz: " << 
wing[0].is_row_major() << "," << body_inv.is_row_major() + << "," << transpose(wing.at(0), true).is_row_major() << "," << Lind.is_row_major() + << std::endl;*/ + construct_L_nvhpc(gpu_dev_stream, ifreq, desc_body); + + Profiler::start("precompute_q_data"); + + const size_t nleb = qw_leb.size(); + std::vector> weights(nleb); + std::vector> q_vectors(nleb); + + const auto L00 = Lind(0, 0), L01 = Lind(0, 1), L02 = Lind(0, 2); + const auto L10 = Lind(1, 0), L11 = Lind(1, 1), L12 = Lind(1, 2); + const auto L20 = Lind(2, 0), L21 = Lind(2, 1), L22 = Lind(2, 2); + +#pragma omp parallel for schedule(static) + for (int ileb = 0; ileb < nleb; ++ileb) + { + const double qx = qx_leb[ileb]; + const double qy = qy_leb[ileb]; + const double qz = qz_leb[ileb]; + + q_vectors[ileb] = {qx, qy, qz}; + + const auto qLq = qx * (qx * L00 + qy * L01 + qz * L02) + + qy * (qx * L10 + qy * L11 + qz * L12) + + qz * (qx * L20 + qy * L21 + qz * L22); + + weights[ileb] = qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / (3.0 * vol_gamma) / qLq; + } + Profiler::stop("precompute_q_data"); + + Profiler::start("cal_inverse_dielectric_matrix_ij"); + int i_start = 0, i_end = n_nonsingular; + int j_start = 0, j_end = n_nonsingular; +#pragma omp parallel for schedule(dynamic, 4) collapse(2) + for (int i = i_start; i != i_end; i++) + { + for (int j = j_start; j != j_end; j++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + if (jlo < 0) continue; + + complex result = 0.0; + + if (i == 0 && j == 0) + { + for (int ileb = 0; ileb < nleb; ++ileb) + { + result += weights[ileb]; + } + } + else if (i == 0 || j == 0) + { + result = 0.0; + } + else + { + const int idx_i = i - 1, idx_j = j - 1; + + const auto bw_i0 = bw(idx_i, 0), bw_i1 = bw(idx_i, 1), bw_i2 = bw(idx_i, 2); + const auto wb_j0 = wb(0, idx_j), wb_j1 = wb(1, idx_j), wb_j2 = wb(2, idx_j); + + for (int ileb = 0; ileb < nleb; ++ileb) + { + const auto &[qx, qy, qz] = 
q_vectors[ileb]; + const auto bwq = bw_i0 * qx + bw_i1 * qy + bw_i2 * qz; + const auto qwb = qx * wb_j0 + qy * wb_j1 + qz * wb_j2; + + result += weights[ileb] * bwq * qwb; + } + } + chi0(ilo, jlo) = result; + } + } + ScalapackConnector::pgeadd_f( + 'N', n_nonsingular - 1, n_nonsingular - 1, + 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, + 1.0, + chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + Profiler::stop("cal_inverse_dielectric_matrix_ij"); + if (mpi_comm_global_h.is_root()) + std::cout << "* Success: calculate average inverse dielectric matrix no." << ifreq + 1 + << "." << std::endl; + Profiler::stop("cal_inverse_dielectric_matrix"); +}; +#endif /*std::complex diele_func::compute_chi0_inv_00(const int ifreq) { std::complex total = 0.0; @@ -1416,4 +1714,23 @@ void diele_func::rewrite_eps(matrix_m> &chi0_block, const i this->body_inv.clear(); }; +#ifdef ENABLE_NVHPC +void diele_func::rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const int& ifreq, Array_Desc& desc_nabf_nabf_opt) +{ + auto desc_body = get_body_inv_nvhpc(gpu_dev_stream, d_chi0_block, desc_nabf_nabf_opt); + this->body_inv = init_local_mat>(desc_body, MAJOR::COL); + CUDA_CHECK(cudaMemcpyAsync(this->body_inv.ptr(),this->d_body_inv.ptr(),sizeof(cuDoubleComplex)*desc_body.m_loc()*desc_body.n_loc(),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + + cal_eps_nvhpc(gpu_dev_stream, ifreq, desc_nabf_nabf_opt, desc_body); + d_chi0_block.set_data(this->chi0.nr(),this->chi0.nc(),this->chi0.ptr(),gpu_dev_stream.stream); + // this->chi0.clear(); + ofs_myid << get_timestamp() << "success finish rewrite_eps"<Lind.clear(); + this->body_inv.clear(); + this->d_body_inv.clean(gpu_dev_stream.stream); + ofs_myid << get_timestamp() << "success clean the data"<> body_inv; // ( i:3, j:3 ) + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_body_inv; + #endif matrix_m> Lind; // ( i:n_lambda, j:3 ) matrix_m> bw; @@ -132,16 +137,26 @@ class diele_func Array_Desc get_body_inv(matrix_m> 
&chi0_block, Array_Desc &desc_nabf_nabf_opt); void construct_L(const int ifreq, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body); + #endif // Lebedev-Laikov quadrature void get_Leb_points(); void get_g_enclosing_gamma(); void calculate_q_gamma(); void cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #endif // not used now due to performance optimization // std::complex compute_chi0_inv_00(const int ifreq); // std::complex compute_chi0_inv_ij(const int ifreq, int i, int j); void rewrite_eps(matrix_m> &chi0_block, const int ifreq, Array_Desc &desc_nabf_nabf_opt); + #ifdef ENABLE_NVHPC + Array_Desc get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice&, const Array_Desc&); + void rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream,ComplexMatrixDevice&,const int& ifreq,Array_Desc& desc_nabf_nabf_opt); + #endif void assign_chi0(matrix_m> &chi0_block, Array_Desc &desc_nabf_nabf_opt); }; From 8f430d7282e5bcddc878b55cbad9b58f7a19f679 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:13:30 +0800 Subject: [PATCH 12/18] Add files via upload --- src/gpu_device_stream.cpp | 0 src/gpu_device_stream.h | 121 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 src/gpu_device_stream.cpp create mode 100644 src/gpu_device_stream.h diff --git a/src/gpu_device_stream.cpp b/src/gpu_device_stream.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/gpu_device_stream.h b/src/gpu_device_stream.h new file mode 100644 index 00000000..849556bd --- /dev/null +++ b/src/gpu_device_stream.h @@ -0,0 +1,121 @@ +#pragma once +//=================hbchen 
2025-05-11========================= +#include +#include +#include +#include +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#endif + +class GpuDeviceStream{ +private: + cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + + return localRank % deviceCount; + } + + +public: + int rank; + int nranks; + int local_device; + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + + cusolverMpHandle_t cusolver_handle = nullptr; + cublasMpHandle_t cublas_handle = nullptr; + GpuDeviceStream(){ + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + local_device = GpuDeviceStream::getLocalDevice(); + // printf("myrank:%d, local_device:%d\n", rank, local_device); + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = GpuDeviceStream::allgather; + params.req_test = 
GpuDeviceStream::request_test; + params.req_free = GpuDeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = rank; + params.nranks = nranks; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate(&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolver_handle, local_device, stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublas_handle, stream)); + + } + ~GpuDeviceStream(){ + if(stream!=nullptr){ + CUDA_CHECK(cudaStreamDestroy(stream)); + stream=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + if(cusolver_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolver_handle)); + cusolver_handle=nullptr; + } + if(cublas_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublas_handle)); + cublas_handle=nullptr; + } + } + void cudaSync() const { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,stream)); + } +}; \ No newline at end of file From 44f9f2c0e85fa35899ad39d354adbd1165f5186e Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:58:42 +0800 Subject: [PATCH 13/18] fix error for non-gpu installation From d2ec742920b66e6096eaa054a92da6d91e6cef54 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Mon, 17 Nov 2025 20:34:09 +0800 Subject: [PATCH 14/18] Add files via upload --- src/dielecmodel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index cff7158f..7ac8ee73 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -1071,7 +1071,7 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Profiler::stop("get_inverse_body_of_chi0"); return desc_body; }; - +#ifdef ENABLE_NVHPC Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& 
d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) { Profiler::start("get_inverse_body_of_chi0_nvhpc"); @@ -1108,7 +1108,7 @@ Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, Profiler::stop("get_inverse_body_of_chi0_nvhpc"); return desc_body; } - +#endif void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) { Profiler::start("cal_L"); From 71970ea907d0a80bff47037cde3e0253c153b08d Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Sat, 22 Nov 2025 20:15:34 +0800 Subject: [PATCH 15/18] merge --- driver/task_gw_band.cpp | 105 +++++++++++++++++++++++++++++++--------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/driver/task_gw_band.cpp b/driver/task_gw_band.cpp index 1a9d99b3..a777e9e7 100644 --- a/driver/task_gw_band.cpp +++ b/driver/task_gw_band.cpp @@ -27,7 +27,6 @@ #include "epsilon_cuda.h" #include #endif - void task_g0w0_band(std::map, ComplexMatrix> &sinvS) { using LIBRPA::envs::mpi_comm_global_h; @@ -118,9 +117,19 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Profiler::start("g0w0_exx", "Build exchange self-energy"); auto exx = LIBRPA::Exx(meanfield, kfrac_list, period); { - Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); - const auto VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); - Profiler::stop("ft_vq_cut"); + atpair_R_mat_t VR; + if (Params::use_fullcoul_exx) + { + Profiler::start("ft_vq_full", "Fourier transform full Coulomb"); + VR = FT_Vq(Vq, meanfield.get_n_kpoints(), Rlist, true); + Profiler::stop("ft_vq_full"); + } + else + { + Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); + VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); + Profiler::stop("ft_vq_cut"); + } Profiler::start("g0w0_exx_real_work"); if (Params::use_shrink_abfs) @@ -158,7 +167,6 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) else #endif Wc_freq_q = compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, 
epsmac_LF_imagfreq); - } else { @@ -444,11 +452,21 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) } } // output bandgap - double bandgap = 0.0; - double valence = -1.e10; - double conduct = 1.e10; - int ik_val = 0; - int ik_cond = 0; + double gw_bandgap = 0.0; + double gw_valence = -1.e10; + double gw_conduct = 1.e10; + double exx_bandgap = 0.0; + double exx_valence = -1.e10; + double exx_conduct = 1.e10; + double dft_bandgap = 0.0; + double dft_valence = -1.e10; + double dft_conduct = 1.e10; + int ik_val_gw = 0; + int ik_cond_gw = 0; + int ik_val_exx = 0; + int ik_cond_exx = 0; + int ik_val_dft = 0; + int ik_cond_dft = 0; int nocc = 0; auto &wg = meanfield.get_weight()[0]; for (int i = 0; i != wg.size; i++) @@ -514,16 +532,38 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) ofs_hf << std::setw(15) << std::setprecision(5) << occ_state << std::setw(15) << std::setprecision(5) << eks_state - vxc_state + exx_state; - // output bandgap - if (i_state == nocc - 1 && eqp > valence) // HOMO + // output GW bandgap + if (i_state == nocc - 1 && eqp > gw_conduct) // HOMO + { + gw_valence = eqp; + ik_val_gw = i_kpoint; + } + else if (i_state == nocc && eqp < gw_conduct) // LUMO + { + gw_conduct = eqp; + ik_cond_gw = i_kpoint; + } + // output EXX bandgap + if (i_state == nocc - 1 && exx_state > exx_valence) // HOMO + { + exx_valence = exx_state; + ik_val_exx = i_kpoint; + } + else if (i_state == nocc && exx_state < exx_conduct) // LUMO + { + exx_conduct = exx_state; + ik_cond_exx = i_kpoint; + } + // output DFT bandgap + if (i_state == nocc - 1 && eks_state > dft_valence) // HOMO { - valence = eqp; - ik_val = i_kpoint; + dft_valence = eks_state; + ik_val_dft = i_kpoint; } - else if (i_state == nocc && eqp < conduct) // LUMO + else if (i_state == nocc && eks_state < dft_conduct) // LUMO { - conduct = eqp; - ik_cond = i_kpoint; + dft_conduct = eks_state; + ik_cond_dft = i_kpoint; } } ofs_gw << "\n"; @@ -531,13 +571,30 @@ void task_g0w0_band(std::map, ComplexMatrix> 
&sinvS) ofs_ks << "\n"; } } - bandgap = conduct - valence; - const auto &k_val = kfrac_band[ik_val]; - printf("VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val + 1, k_val.x, k_val.y, k_val.z); - const auto &k_cond = kfrac_band[ik_cond]; - printf("CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond + 1, k_cond.x, k_cond.y, - k_cond.z); - lib_printf("Bandgap(eV): %12.7f \n", bandgap); + gw_bandgap = gw_conduct - gw_valence; + exx_bandgap = exx_conduct - exx_valence; + dft_bandgap = dft_conduct - dft_valence; + const auto &k_val_gw = kfrac_band[ik_val_gw]; + const auto &k_cond_gw = kfrac_band[ik_cond_gw]; + printf("GW VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_gw + 1, k_val_gw.x, k_val_gw.y, + k_val_gw.z); + printf("GW CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_gw + 1, k_cond_gw.x, + k_cond_gw.y, k_cond_gw.z); + lib_printf("GW bandgap(eV): %12.7f \n", gw_bandgap); + const auto &k_val_exx = kfrac_band[ik_val_exx]; + const auto &k_cond_exx = kfrac_band[ik_cond_exx]; + printf("EXX VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_exx + 1, k_val_exx.x, + k_val_exx.y, k_val_exx.z); + printf("EXX CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_exx + 1, k_cond_exx.x, + k_cond_exx.y, k_cond_exx.z); + lib_printf("EXX bandgap(eV): %12.7f \n", exx_bandgap); + const auto &k_val_dft = kfrac_band[ik_val_dft]; + const auto &k_cond_dft = kfrac_band[ik_cond_dft]; + printf("DFT VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_dft + 1, k_val_dft.x, + k_val_dft.y, k_val_dft.z); + printf("DFT CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_dft + 1, k_cond_dft.x, + k_cond_dft.y, k_cond_dft.z); + lib_printf("DFT bandgap(eV): %12.7f \n", dft_bandgap); } Profiler::stop("g0w0_solve_band_qpe"); From ac931d4bf5e6fb275eb9eb58156fd12e4796e895 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Tue, 2 Dec 2025 16:50:28 +0800 Subject: [PATCH 16/18] add gpu head_wing calaulation --- src/dielecmodel.cpp | 154 
++++++++++++++++---------------------------- 1 file changed, 56 insertions(+), 98 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index 7ac8ee73..ee046300 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -19,6 +19,7 @@ using RI::Communicate_Tensors_Map_Judge::comm_map2_first; using LIBRPA::envs::mpi_comm_global_h; using LIBRPA::envs::ofs_myid; using LIBRPA::utils::lib_printf; + #ifdef ENABLE_NVHPC #include "cuda_connector.h" #endif @@ -500,8 +501,16 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac int n_lambda = this->n_nonsingular - 1; Array_Desc desc_wing_mu(blacs_ctxt_global_h); desc_wing_mu.init_square_blk(n_abf, 3, 0, 0); + desc_wing_mu.init(n_abf, 3, desc_nabf_nabf_opt.mb(), desc_wing_mu.nb(), 0, 0); Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); + Array_Desc desc_body(blacs_ctxt_global_h); + desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128, desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); + + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + for (int iomega = 0; iomega != this->omega.size(); iomega++) { auto &wing_tmp = this->wing.at(iomega); @@ -524,7 +533,8 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac // drop the first column of sqrtveig_blacs, the largest eigenvalue ScalapackConnector::pgemm_f('C', 'N', n_lambda, 3, n_abf, 1.0, sqrtveig_blacs.ptr(), 1, 2, desc_nabf_nabf_opt.desc, wing_mu_tmp.ptr(), 1, 1, - desc_wing_mu.desc, 0.0, wing_tmp.ptr(), 1, 1, desc_wing.desc); + desc_wing_mu.desc, 0.0, wing_tmp.ptr(), 1, 1, + desc_wing.desc); } this->wing_mu.clear(); @@ -1053,6 +1063,8 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, 
n_nonsingular - 1, mb, mb, 0, 0); this->body_inv = init_local_mat>(desc_body, MAJOR::COL); /* for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) @@ -1071,6 +1083,7 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Profiler::stop("get_inverse_body_of_chi0"); return desc_body; }; + #ifdef ENABLE_NVHPC Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) { @@ -1078,6 +1091,8 @@ Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, gpu_dev_stream.calSync(); Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); this->d_body_inv.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); Array_Desc_Device desc_body_dev(desc_body); CudaConnector::pgemr2d_nvhpc( @@ -1117,15 +1132,17 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); - + // opt descriptor for wing + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + Array_Desc desc_lam_3(blacs_ctxt_global_h); - desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); Array_Desc desc_3_lam(blacs_ctxt_global_h); - desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); Array_Desc desc_3_3(blacs_ctxt_global_h); - desc_3_3.init_square_blk(3, 3, 0, 0); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); @@ -1138,8 +1155,8 @@ void 
diele_func::construct_L(const int ifreq, Array_Desc &desc_body) desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, Lind_loc.ptr(), 1, 1, desc_3_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, - wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, - desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, + 1, desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); for (int i = 0; i != 3; i++) { @@ -1184,126 +1201,70 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); - - Array_Desc desc_wing_opt(blacs_ctxt_global_h); - desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); - - + // opt descriptor for wing + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + Array_Desc desc_lam_3(blacs_ctxt_global_h); - desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); - - Array_Desc desc_lam_3_opt(blacs_ctxt_global_h); - desc_lam_3_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_lam_3.nb(), 0, 0); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); Array_Desc desc_3_lam(blacs_ctxt_global_h); - desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); - - Array_Desc desc_3_lam_opt(blacs_ctxt_global_h); - desc_3_lam_opt.init(3, n_nonsingular - 1, desc_3_lam.mb(), desc_body.nb(), 0, 0); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); Array_Desc desc_3_3(blacs_ctxt_global_h); - desc_3_3.init_square_blk(3, 3, 0, 0); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); auto Lind_loc = init_local_mat>(desc_3_3, MAJOR::COL); - // 
printf("rank:%d,ma:%d,na:%d,mb:%d,nb:%d,mc:%d,nc:%d\n",gpu_dev_stream.rank,desc_body.m(),desc_body.n(),desc_wing_opt.m(),desc_wing_opt.n(),desc_lam_3_opt.m(),desc_lam_3_opt.n()); - // printf("rank:%d,m_loc_a:%d,n_loc_a:%d,m_loc_b:%d,n_loc_b:%d,m_loc_c:%d,n_loc_c:%d\n",gpu_dev_stream.rank,desc_body.m_loc(),desc_body.n_loc(),desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc()); - // printf("rank:%d,mba:%d,nba:%d,mbb:%d,nbb:%d,mbc:%d,nbc:%d\n",gpu_dev_stream.rank,desc_body.mb(),desc_body.nb(),desc_wing_opt.mb(),desc_wing_opt.nb(),desc_lam_3_opt.mb(),desc_lam_3_opt.nb()); // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); - #ifdef ENABLE_NVHPC - auto wing_ifreq_opt = init_local_mat>(desc_wing_opt, MAJOR::COL); - auto lam_3_opt = init_local_mat>(desc_lam_3_opt, MAJOR::COL); - auto _3_lam_opt = init_local_mat>(desc_3_lam_opt, MAJOR::COL); - ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc; - ComplexMatrixDevice d_lam_3_opt,d_3_lam_opt; - ComplexMatrixDevice d_wing_ifreq,d_wing_ifreq_opt; + + ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc,d_wing_ifreq; + d_wing_ifreq.set_data(wing.at(ifreq).nr(),wing.at(ifreq).nc(),wing.at(ifreq).ptr(),gpu_dev_stream.stream); - d_wing_ifreq_opt.set_data(desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),gpu_dev_stream.stream); - // d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); - d_lam_3_opt.set_data(desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc(),gpu_dev_stream.stream); - // d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); - d_3_lam_opt.set_data(desc_3_lam_opt.m_loc(),desc_3_lam_opt.n_loc(),gpu_dev_stream.stream); + + d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); + d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); - ScalapackConnector::pgemr2d_f( - n_nonsingular-1, 3, - 
wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, - wing_ifreq_opt.ptr(), 1, 1, desc_wing_opt.desc, - blacs_ctxt_global_h.ictxt - ); - // CudaConnector::pgemr2d_nvhpc( - // gpu_dev_stream, n_nonsingular-1, 3, - // d_wing_ifreq.ptr(), 1, 1, desc_wing, - // d_wing_ifreq_opt.ptr(), 1, 1, desc_wing, - // CUDA_C_64F - // ); - CUDA_CHECK(cudaMemcpyAsync(d_wing_ifreq_opt.ptr(),wing_ifreq_opt.ptr(),desc_wing_opt.m_loc()*desc_wing_opt.n_loc()*sizeof(std::complex),cudaMemcpyHostToDevice,gpu_dev_stream.stream)); - std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + printf("successful before gemm1\n"); CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, &calpha, d_body_inv, 1, 1, desc_body, - d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_wing_ifreq, 1, 1, desc_wing, &cbeta, - d_lam_3_opt, 1, 1, desc_lam_3_opt, + d_lam_3, 1, 1, desc_lam_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - CUDA_CHECK(cudaMemcpyAsync(lam_3_opt.ptr(),d_lam_3_opt.ptr(),desc_lam_3_opt.m_loc()*desc_lam_3_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); - gpu_dev_stream.cudaSync(); - ScalapackConnector::pgemr2d_f( - n_nonsingular-1, 3, - lam_3_opt.ptr(), 1, 1, desc_lam_3_opt.desc, - lam_3.ptr(), 1, 1, desc_lam_3.desc, - blacs_ctxt_global_h.ictxt - ); - gpu_dev_stream.cudaSync(); - #else - ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, - body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, - desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); - #endif - #ifdef ENABLE_NVHPC + printf("successful after gemm1\n"); + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); + // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); + // 
printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, 3, n_nonsingular - 1, &calpha, - d_wing_ifreq_opt, 1, 1, desc_wing_opt, - d_lam_3_opt, 1, 1, desc_lam_3_opt, + d_wing_ifreq, 1, 1, desc_wing, + d_lam_3, 1, 1, desc_lam_3, &cbeta, d_Lind_loc, 1, 1, desc_3_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(),d_Lind_loc.ptr(),desc_3_3.m_loc()*desc_3_3.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); - gpu_dev_stream.cudaSync(); - #else - ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, - desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, - Lind_loc.ptr(), 1, 1, desc_3_3.desc); - #endif - #ifndef ENABLE_NVHPC + printf("successful after gemm2\n"); CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, &calpha, - d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_wing_ifreq, 1, 1, desc_wing, d_body_inv, 1, 1, desc_body, &cbeta, - d_3_lam_opt, 1, 1, desc_3_lam_opt, + d_3_lam, 1, 1, desc_3_lam, CUBLAS_COMPUTE_64F_PEDANTIC ); - CUDA_CHECK(cudaMemcpyAsync(_3_lam_opt.ptr(),d_3_lam_opt.ptr(),desc_3_lam_opt.m_loc()*desc_3_lam_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + printf("successful after gemm3\n"); + CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); gpu_dev_stream.cudaSync(); 
- ScalapackConnector::pgemr2d_f( - 3, n_nonsingular-1, - _3_lam_opt.ptr(), 1, 1, desc_3_lam_opt.desc, - _3_lam.ptr(), 1, 1, desc_3_lam.desc, - blacs_ctxt_global_h.ictxt - ); - #else - ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, - wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, - desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); - #endif - for (int i = 0; i != 3; i++) { auto loc_i = desc_3_3.indx_g2l_r(i); @@ -1499,12 +1460,9 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - ScalapackConnector::pgeadd_f( - 'N', n_nonsingular - 1, n_nonsingular - 1, - 1.0, - body_inv.ptr(), 1, 1, desc_body.desc, - 1.0, - chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + + ScalapackConnector::pgeadd_f('N', n_nonsingular - 1, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, + 1, desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); Profiler::stop("cal_inverse_dielectric_matrix_ij"); if (mpi_comm_global_h.is_root()) std::cout << "* Success: calculate average inverse dielectric matrix no." 
<< ifreq + 1 From 3b16358ca3fd394db2d940684542bc310ecacafa Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:25:52 +0800 Subject: [PATCH 17/18] Update dielecmodel.cpp --- src/dielecmodel.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index ee046300..74de2b27 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -1227,7 +1227,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); std::complex calpha(1.0,0.0),cbeta(0.0,0.0); - printf("successful before gemm1\n"); + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, &calpha, @@ -1237,7 +1237,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_lam_3, 1, 1, desc_lam_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - printf("successful after gemm1\n"); + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); // printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); @@ -1250,7 +1250,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc, 1, 1, desc_3_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - printf("successful after gemm2\n"); + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, &calpha, @@ -1260,7 +1260,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_3_lam, 1, 1, desc_3_lam, CUBLAS_COMPUTE_64F_PEDANTIC ); - printf("successful after gemm3\n"); + 
CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); From 0d11aebc60fa3f24b117ad4801276bcf3ded3e9c Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:45:09 +0800 Subject: [PATCH 18/18] Add files via upload --- src/dielecmodel.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index cdcff61a..8338c640 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -514,7 +514,7 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac for (int iomega = 0; iomega != this->omega.size(); iomega++) { auto &wing_tmp = this->wing.at(iomega); - wing_tmp = init_local_mat>(desc_wing_opt, MAJOR::COL); + wing_tmp = init_local_mat>(desc_wing, MAJOR::COL); // TODO: reconstruct wing_mu auto wing_mu_tmp = init_local_mat>(desc_wing_mu, MAJOR::COL); for (int alpha = 0; alpha != 3; alpha++) @@ -1150,9 +1150,9 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); + desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, + 
desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, Lind_loc.ptr(), 1, 1, desc_3_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, @@ -1227,7 +1227,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); std::complex calpha(1.0,0.0),cbeta(0.0,0.0); - + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, &calpha, @@ -1237,7 +1237,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_lam_3, 1, 1, desc_lam_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); // printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); @@ -1250,7 +1250,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc, 1, 1, desc_3_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, &calpha, @@ -1260,7 +1260,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_3_lam, 1, 1, desc_3_lam, CUBLAS_COMPUTE_64F_PEDANTIC ); - + CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); 
CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream));