From adbe674a8dee9a7c7cc3a5d5c8b5bce1dfc2ad36 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:09:12 +0800 Subject: [PATCH 01/18] epsilon.cpp fix a bug for rpa calculation when open 4 mpi --- epsilon.cpp | 2971 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2971 insertions(+) create mode 100644 epsilon.cpp diff --git a/epsilon.cpp b/epsilon.cpp new file mode 100644 index 00000000..9ed7d46b --- /dev/null +++ b/epsilon.cpp @@ -0,0 +1,2971 @@ +#include "epsilon.h" +#define OPEN_TEST_FOR_LU_DECOMPOSITION +#include +#include + +#include +#include +#include +#include +#include + +#include "atoms.h" +#include "constants.h" +#include "envs_blacs.h" +#include "envs_io.h" +#include "envs_mpi.h" +#include "lapack_connector.h" +#include "libri_utils.h" +#include "matrix_m_parallel_utils.h" +#include "parallel_mpi.h" +#include "params.h" +#include "pbc.h" +#include "profiler.h" +#include "scalapack_connector.h" +#include "stl_io_helper.h" +#include "utils_blacs.h" +#include "utils_io.h" +#include "utils_mem.h" +#include "utils_mpi_io.h" + +#ifdef LIBRPA_USE_LIBRI +#include +#include +using RI::Tensor; +using RI::Communicate_Tensors_Map_Judge::comm_map2_first; +#endif + +using LIBRPA::Array_Desc; +using LIBRPA::envs::blacs_ctxt_global_h; +using LIBRPA::envs::mpi_comm_global_h; +using LIBRPA::envs::ofs_myid; +using LIBRPA::utils::lib_printf; + +CorrEnergy compute_RPA_correlation_blacs_2d_gamma_only(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) + lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D gamma_only\n"); + // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); + const auto &mf = chi0.mf; + const double CONE = 1.0; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + // std::cout << "n_abf " << n_abf << std::endl; + // std::cout << "n_atoms " << LIBRPA::atomic_basis_abf.n_atoms << std::endl; + const auto 
part_range = LIBRPA::atomic_basis_abf.get_part_range(); + // std::cout << "part_range " << part_range[0] << " " << part_range[1] << std::endl; + auto nbs_ = LIBRPA::atomic_basis_abf.get_atom_nbs(); + // std::cout << "nbs_ " << nbs_[0] << " " << nbs_[1] << std::endl; + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); + + vector> qpts; + for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + qpts.push_back(qMuNuchi.first); + + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>> + coul_libri; + + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << + // endl; + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + matrix tmp_vq_real = (*Vq).real(); + std::valarray Vq_va(tmp_vq_real.c, Vq->size); + auto pvq = std::make_shared>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, std::array{0, 0, 0}}] = + Tensor({n_mu, n_nu}, pvq); + coulmat.at(Mu).at(Nu).at(q).reset(); + } + + LIBRPA::utils::release_free_mem(); + + // printf("Finish RPA blacs 2d vq arr\n"); + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + + double block_begin = omp_get_wtime(); + + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + + double block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>> + chi0_libri; + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + + for (const auto &M_Nchi 
: chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second.real(); + std::valarray chi_va(chi.c, chi.size); + auto pchi = std::make_shared>(); + *pchi = chi_va; + chi0_libri[M][{N, std::array{0, 0, 0}}] = + Tensor({n_mu, n_nu}, pchi); + } + } + + // if(mpi_comm_global_h.is_root()) + // { + // lib_printf("Begin to clean chi0 !!! \n"); + // system("free -m"); + // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); + // } + + chi0.free_chi0_q(freq, q); + + LIBRPA::utils::release_free_mem(); + + // if(mpi_comm_global_h.is_root()) + // { + // lib_printf("After clean chi0 !!! \n"); + // system("free -m"); + // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); + // } + + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + // ofs_myid << "chi0_libri" << endl << chi0_libri; + + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + // printf("End collect block myid: %d ifreq: %d TIME_USED: + // %f\n",mpi_comm_global_h.myid,ifreq,chi_end_comm-chi_end_arr); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + } + + double pi_begin = omp_get_wtime(); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc); + // char fnp[100]; + // sprintf(fnp, 
"pi_ifreq_%d_iq_%d.mtx", ifreq, iq); + double pi_end = omp_get_wtime(); + // printf("End pgemm myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); + double trace_pi = 0.0; + double trace_pi_loc = 0.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); + } + + coul_chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; + } + + int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; + int info; + // printf("begin det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); + double ln_det = + compute_pi_det_blacs_2d_gamma_only(coul_chi0_block, desc_nabf_nabf, ipiv, info); + // printf("End det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = complex(trace_pi + ln_det); + /*std::cout << "q: " << iq << ", freq: " << ifreq << ", ln_det:" << ln_det + << ", trace_pi: " << trace_pi << ", rpa_for_omega_q" << rpa_for_omega_q + << ", contribution: " + << rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI << std::endl;*/ + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check + // std::cout << "rpa_for_omega_q: " << rpa_for_omega_q + // << ", freq_weight: " << freq_weight << ", irk_weight[q]:" << + // irk_weight[q] + // << ", cRPA_q[q]: " << cRPA_q[q] << std::endl; + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + } + mpi_comm_global_h.barrier(); + corr.value = tot_RPA_energy; + + corr.etype = CorrEnergy::type::RPA; + return corr; +} + +CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + lib_printf("Begin to compute_RPA_correlation_blacs_2d myid: %d\n", mpi_comm_global_h.myid); + system("free -m"); + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D\n"); + // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + // ofs_myid << "Iset Jset " << s0_s1 << endl; + // ofs_myid << "atpair_unordered_local of myid " << blacs_ctxt_global_h.myid << " " << + // 
atpair_unordered_local << endl; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before vector qpts processid:%d,chi0.tfg.get_freq_nodes()[0]:%f,chi0.get_chi0_q().size():%d\n", mpi_comm_global_h.myid, + // chi0.tfg.get_freq_nodes()[0], chi0.get_chi0_q().size()); + // printf("chi0.get_chi0_q().empty():%d\n", chi0.get_chi0_q().empty()); + printf("processId:%d,chi0.klist.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size()); + // for(const auto &k : chi0.klist) + // { + // printf("processId:%d, k: (%f, %f, %f)\n", mpi_comm_global_h.myid, k.x, k.y, k.z); + // } + #endif + vector> qpts; + + // for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + // { + // qpts.push_back(qMuNuchi.first); + // #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // const auto &q = qMuNuchi.first; + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + // #endif + // } + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif +#ifdef LIBRPA_USE_LIBRI + + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << + // endl; + 
#ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + } + // printf("Finish RPA blacs 2d vq arr\n"); + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + // printf("End vq comm_map2_first myid: %d TIME_USED: + // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); + // ofs_myid << "IJq_coul" << endl << IJq_coul; + // printf("Finish RPA blacs 2d vq 2d\n"); + double block_begin = omp_get_wtime(); + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // // cout << IJq_coul.at(I).at({J, qa}); + // collect_block_from_IJ_storage_syhe( + // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", + // // blacs_ctxt_global_h.myid, I, J, + // // coul_block.nr(), coul_block.nc(), + // // str(coul_block).c_str()); + // } + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double 
block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + + // if(mpi_comm_global_h.is_root()) + // printf("Finish RPA blacs 2d vq comm\n"); + // char fn[100]; + // sprintf(fn, "coul_iq_%d.mtx", iq); + // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); + // ofs_myid << str(coul_block); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>>> + chi0_libri; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + // printf("processId:%d,chi0.get_chi0_q().empty():%d\n", mpi_comm_global_h.myid, chi0.get_chi0_q().empty()); + #endif + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + // const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + #endif + chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); + } + } + if (mpi_comm_global_h.is_root()) + { + lib_printf("Begin to clean chi0 !!! \n"); + LIBRPA::utils::display_free_mem(); + lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", + chi0_wq.size(), freq, q.x, q.y, q.z); + } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + + + LIBRPA::utils::release_free_mem(); + // if(mpi_comm_global_h.is_root()) + // { + // lib_printf("After clean chi0 !!! 
\n"); + // system("free -m"); + // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); + // } + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + // ofs_myid << "chi0_libri" << endl << chi0_libri; + + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + // char fnc[100]; + // sprintf(fnc, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); + // if( ifreq== 0) + // print_matrix_mm_file_parallel(fnc, chi0_block, desc_nabf_nabf); + } + + double pi_begin = omp_get_wtime(); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, + desc_nabf_nabf.desc); + // char fnp[100]; + // sprintf(fnp, "pi_ifreq_%d_iq_%d.mtx", ifreq, iq); + double pi_end = omp_get_wtime(); + + complex trace_pi(0.0, 0.0); + complex trace_pi_loc(0.0, 0.0); + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); + } + + coul_chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf.indx_g2l_r(i); + const int jlo = desc_nabf_nabf.indx_g2l_c(i); + if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; + // std::cout << "1-Pi: " << ilo << "," << jlo << "," << coul_chi0_block(ilo, jlo) + //<< std::endl; + } + // if( ifreq== 0 && mpi_comm_global_h.is_root() ) + // print_whole_matrix("pi-2D-loc", 
coul_chi0_block); + + int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; + int info; + complex ln_det = + compute_pi_det_blacs_2d(coul_chi0_block, desc_nabf_nabf, ipiv, info); + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + // double task_end = omp_get_wtime(); + // if(mpi_comm_global_h.is_root()) + // lib_printf("| After det for freq: %f, q: ( %f, %f, %f) TIME_LOCMAT: %f + // TIME_DET: %f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n",ifreq, + // q.x,q.y,q.z,task_mid-task_begin,task_end-task_mid,pi_time,loc_tran_time); + // para_mpi.mpi_barrier(); + + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = trace_pi + ln_det; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // < &loc_piT, const Array_Desc &arrdesc_pi, + int *ipiv, int &info) +{ + int one = 1; + int range_all = N_all_mu; + int DESCPI_T[9]; + + double det_begin = omp_get_wtime(); + + ScalapackConnector::pgetrf_f(range_all, range_all, loc_piT.ptr(), one, one, arrdesc_pi.desc, + ipiv, info); + double trf_end = omp_get_wtime(); + + double ln_det_loc = 0.0; + double ln_det_all = 0.0; + + for (int ig = 0; ig != range_all; ig++) + { + int locr = arrdesc_pi.indx_g2l_r(ig); + int locc = arrdesc_pi.indx_g2l_c(ig); + if (locr >= 0 && locc >= 0) + { + double tmp_ln_det; + if (loc_piT(locr, locc) > 0) + { + tmp_ln_det = std::log(loc_piT(locr, locc)); + } + else + { + tmp_ln_det = std::log(-loc_piT(locr, locc)); + } + ln_det_loc += tmp_ln_det; + } + } + double ln_end = omp_get_wtime(); + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); + double det_end = omp_get_wtime(); + return ln_det_all; +} +complex compute_pi_det_blacs_2d(matrix_m> &loc_piT, + const Array_Desc &arrdesc_pi, int *ipiv, int &info) +{ + int one = 1; + int range_all = N_all_mu; + int DESCPI_T[9]; + // if(out_pi) + // { + // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); + // print_complex_real_matrix("first_loc_piT_mat",loc_piT); + // } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before pzgetrf_ processid:%d,range_all: %d, loc_piT.nr(): %d, loc_piT.nc(): %d\n", + mpi_comm_global_h.myid, range_all, loc_piT.nr(), loc_piT.nc()); + #endif + double det_begin = omp_get_wtime(); + // ScalapackConnector::transpose_desc(DESCPI_T, 
arrdesc_pi.desc); + pzgetrf_(&range_all, &range_all, loc_piT.ptr(), &one, &one, arrdesc_pi.desc, ipiv, &info); + double trf_end = omp_get_wtime(); + // ScalapackConnector::pgetrf_f(range_all,range_all,loc_piT.c,one,one,DESCPI_T,ipiv, info); + // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); + // printf("desc myid: %d, m n: %d,%d, mb nb: %d, %d, loc_m_n: %d, %d, myp: %d,%d, npr,npc: + // %d, %d\n",mpi_comm_global_h.myid, arrdesc_pi.m(),arrdesc_pi.n(), + // arrdesc_pi.mb(),arrdesc_pi.nb(), + // arrdesc_pi.m_loc(),arrdesc_pi.n_loc(),arrdesc_pi.myprow(),arrdesc_pi.mypcol(),arrdesc_pi.nprows(),arrdesc_pi.npcols()); + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + // complex det_loc(1.0,0.0); + // complex det_glo(0.0,0.0); + // vector> det_dig; + // vector> ln_det_dig; + // vector> det_dig_r; + // vector> det_dig_c; + // printf(" myid: %d ig=25, locr,locc: %d, + // %d)\n",mpi_comm_global_h.myid,arrdesc_pi.indx_g2l_r(25),arrdesc_pi.indx_g2l_c(25)); + for (int ig = 0; ig != range_all; ig++) + { + // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); + // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); + int locr = arrdesc_pi.indx_g2l_r(ig); + int locc = arrdesc_pi.indx_g2l_c(ig); + if (locr >= 0 && locc >= 0) + { + // if(ipiv[locr]!=(ig+1)) + // det_loc=-1*det_loc * loc_piT(locc,locr); + // else + // det_loc=det_loc * loc_piT(locc,locr); + // det_dig.push_back(loc_piT(locr,locc)); + // det_dig_r.push_back(locr); + // det_dig_c.push_back(locc); + complex tmp_ln_det; + if (loc_piT(locr, locc).real() > 0) + { + tmp_ln_det = std::log(loc_piT(locr, locc)); + // ln_det_dig.push_back(tmp_ln_det); + } + else + { + tmp_ln_det = std::log(-loc_piT(locr, locc)); + // ln_det_dig.push_back(tmp_ln_det); + } + ln_det_loc += tmp_ln_det; + } + } + double ln_end = omp_get_wtime(); + // ComplexMatrix det_mm(loc_piT.nr(),loc_piT.nc()); + // for(int i=0;i!=loc_piT.nr();i++) + // for(int j=0;j!=loc_piT.nc();j++) + // 
det_mm(i,j)=loc_piT(i,j); + // // sort(det_dig.rbegin(),det_dig.rend()); + // ComplexMatrix det_dig_mm(det_dig.size(),4); + // for(int i=0;i!=det_dig.size();i++) + // { + // det_dig_mm(i,0) =det_dig_r[i]; + // det_dig_mm(i,1) =det_dig_c[i]; + // det_dig_mm(i,2)=det_dig[i]; + // det_dig_mm(i,3)=ln_det_dig[i]; + // } + // char fn[100]; + // sprintf(fn, "det_dig_myid_%d.mtx", mpi_comm_global_h.myid); + // print_complex_matrix_file("det_dig_loc", det_dig_mm, fn, false); + + // sprintf(fn, "det_mat_myid_%d.mtx", mpi_comm_global_h.myid); + // print_complex_matrix_file("det_mat_loc", det_mm, fn, false); + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + double det_end = omp_get_wtime(); + // if(mpi_comm_global_h.myid == 0) + // lib_printf(" | Det time trf: %f ln: %f allreduce: + // %f\n",trf_end-det_begin,ln_end-trf_end, det_end-ln_end); + // MPI_Allreduce(&det_loc,&det_glo,1,MPI_DOUBLE_COMPLEX,MPI_PROD,mpi_comm_global_h.comm); + // ln_det_all=std::log(det_glo); + return ln_det_all; +} + +complex compute_pi_det_blacs(ComplexMatrix &loc_piT, const Array_Desc &arrdesc_pi, + int *ipiv, int &info) +{ + // int range_all = atom_mu_part_range[natom-1]+atom_mu[natom-1]; + // int desc_pi[9]; + // int loc_row, loc_col, info; + // int row_nblk=1; + // int col_nblk=1; + int one = 1; + int range_all = N_all_mu; + // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,range_all,range_all,row_nblk,col_nblk); + // int *ipiv = new int [loc_row*10]; + // ComplexMatrix loc_piT(loc_col,loc_row); + + // for(int i=0;i!=loc_row;i++) + // { + // int global_row = para_mpi.globalIndex(i,row_nblk,para_mpi.nprow,para_mpi.myprow); + // int mu; + // int I=atom_mu_glo2loc(global_row,mu); + // for(int j=0;j!=loc_col;j++) + // { + // int global_col = para_mpi.globalIndex(j,col_nblk,para_mpi.npcol,para_mpi.mypcol); + // int nu; + // int J=atom_mu_glo2loc(global_col,nu); + + // if( global_col == global_row) + // { + // loc_piT(j,i)=complex(1.0,0.0) - 
pi_freq_q.at(I).at(J)(mu,nu); + // } + // else + // { + // loc_piT(j,i)=-1* pi_freq_q.at(I).at(J)(mu,nu); + // } + + // } + // } + int DESCPI_T[9]; + // if(out_pi) + // { + // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); + // print_complex_real_matrix("first_loc_piT_mat",loc_piT); + // } + + ScalapackConnector::transpose_desc(DESCPI_T, arrdesc_pi.desc); + + // para_mpi.mpi_barrier(); + // printf(" before LU Myid: %d Available DOS memory = %ld + // bytes\n",mpi_comm_global_h.myid, memavail()); printf(" before LU myid: %d range_all: %d, + // loc_mat.size: %d\n",mpi_comm_global_h.myid,range_all,loc_piT.size); + pzgetrf_(&range_all, &range_all, loc_piT.c, &one, &one, DESCPI_T, ipiv, &info); + // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + for (int ig = 0; ig != range_all; ig++) + { + // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); + // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); + int locr = arrdesc_pi.indx_g2l_r(ig); + int locc = arrdesc_pi.indx_g2l_c(ig); + if (locr >= 0 && locc >= 0) + { + // if(ipiv[locr]!=(ig+1)) + // det_loc=-1*det_loc * loc_piT(locc,locr); + // else + // det_loc=det_loc * loc_piT(locc,locr); + if (loc_piT(locc, locr).real() > 0) + ln_det_loc += std::log(loc_piT(locc, locr)); + else + ln_det_loc += std::log(-loc_piT(locc, locr)); + } + } + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + return ln_det_all; +} + +CorrEnergy compute_RPA_correlation_blacs(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) +{ + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK row\n"); + + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + 
LIBRPA::Array_Desc arrdesc_pi(blacs_ctxt_global_h); + arrdesc_pi.init_square_blk(n_abf, n_abf, 0, 0); + int loc_row = arrdesc_pi.m_loc(), loc_col = arrdesc_pi.n_loc(), info; + + // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,N_all_mu,N_all_mu,row_nblk,col_nblk); + int *ipiv = new int[loc_row * 10]; + // double vq_begin_m2t= omp_get_wtime(); + // std::map>, Tensor>>> + // vq_libri; for(auto &Ip:Vq) + // { + // auto I=Ip.first; + // const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(I); + // for(auto &Jp:Ip.second) + // { + // auto J=Jp.first; + // const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(J); + // for(auto &qp:Jp.second) + // { + // auto q=qp.first; + // std::array qa = {q.x, q.y, q.z}; + // const auto &vq_ptr=qp.second; + // std::valarray> Vq_va(vq_ptr->c, vq_ptr->size); + // auto pvq = std::make_shared>>(); + // *pvq = Vq_va; + // vq_libri[I][{J, qa}] = Tensor>({n_mu, n_nu}, pvq); + // if(I!=J) + // { + // auto vqT=transpose(*vq_ptr, 1); + // std::valarray> VqT_va(vqT.c, vqT.size); + // auto pvqT = std::make_shared>>(); + // *pvqT = VqT_va; + // vq_libri[J][{I, qa}] = Tensor>({n_nu, n_mu}, pvqT); + // } + // } + // } + // } + // double vq_end_m2t = omp_get_wtime(); + // set loc_atp_IJ; + // for(auto &atp:local_atpair) + // { + // loc_atp_IJ.insert(atp.first); + // loc_atp_IJ.insert(atp.second); + // } + // set all_atom_set; + // for(int I=0;I!=natom;I++) + // all_atom_set.insert(I); + // const auto IJq_coul = Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, + // vq_libri, all_atom_set, loc_atp_IJ); atpair_k_cplx_mat_t Vq_loc; double vq_end_comm = + // omp_get_wtime(); for(auto Ip:IJq_coul) + // { + // auto I=Ip.first; + // auto n_mu=atom_mu[I]; + // for(auto &Jqp:Ip.second) + // { + // auto J=Jqp.first.first; + // auto n_nu=atom_mu[J]; + // auto qa=Jqp.first.second; + // Vector3_Order q{qa[0],qa[1],qa[2]}; + // shared_ptr vq_ptr = make_shared(); + // vq_ptr->create(n_mu, n_nu); + // const auto length=sizeof(complex)* n_mu 
*n_nu; + // memcpy((*vq_ptr).c, Jqp.second.ptr(),length); + // Vq_loc[I][J][q]=vq_ptr; + // //printf("| process %d, I: %d J: %d\n",mpi_comm_global_h.myid, I,J ); + // } + // } + // double vq_end_t2m = omp_get_wtime(); + // mpi_comm_global_h.barrier(); + // if(mpi_comm_global_h.is_root()) + // lib_printf("| Vq_time %f, TIME_m2t: %f TIME_comm: %f TIME_t2m: + // %f\n",vq_end_t2m-vq_begin_m2t,vq_end_m2t-vq_begin_m2t,vq_end_comm-vq_end_m2t,vq_end_t2m-vq_end_comm); + map, ComplexMatrix>> pi_freq_q; + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + for (const auto &freq_q_MuNuchi0 : chi0.get_chi0_q()) + { + const auto freq = freq_q_MuNuchi0.first; + const double freq_weight = chi0.tfg.find_freq_weight(freq); + for (const auto &q_MuNuchi0 : freq_q_MuNuchi0.second) + { + double task_begin = omp_get_wtime(); + const auto q = q_MuNuchi0.first; + auto &MuNuchi0 = q_MuNuchi0.second; + + // ComplexMatrix loc_piT(loc_col,loc_row); + auto loc_piT = init_local_mat>(arrdesc_pi, MAJOR::COL); + complex trace_pi(0.0, 0.0); + double vq_time = 0.0; + double pi_time = 0.0; + double loc_tran_time = 0.0; + for (int Mu = 0; Mu != natom; Mu++) + { + double Mu_begin = omp_get_wtime(); + // lib_printf(" |process %d, Mu: %d\n",mpi_comm_global_h.myid,Mu); + const size_t n_mu = atom_mu[Mu]; + atom_mapping::pair_t_old Vq_row = gather_vq_row_q(Mu, coulmat, q); + double Mu_after_vq = omp_get_wtime(); + // atom_mapping::pair_t_old Vq_row; + // const auto IJq_coul = + // Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, vq_libri, + // {Mu}, loc_atp_atoms); double Mu_vq_comm = omp_get_wtime(); for(auto Ip:IJq_coul) + // { + // auto I=Ip.first; + // auto n_mu=atom_mu[I]; + // for(auto &Jqp:Ip.second) + // { + // auto J=Jqp.first.first; + // auto n_nu=atom_mu[J]; + // auto q=Jqp.first.second; + // Vq_row[I][J].create(n_mu,n_nu); + // const auto length=sizeof(complex)* n_mu *n_nu; + // memcpy(Vq_row[I][J].c, Jqp.second.ptr(),length); + // } + // } + // double 
Mu_after_vq=omp_get_wtime(); + // printf(" |process %d, Mu: %d vq_row.size: + // %d\n",para_mpi.get_myid(),Mu,Vq_row[Mu].size()); ComplexMatrix + // loc_pi_row=compute_Pi_freq_q_row(q,MuNuchi0,Vq_loc,Mu,q); + ComplexMatrix loc_pi_row = compute_Pi_freq_q_row(q, MuNuchi0, Vq_row, Mu); + // printf(" |process %d, compute_pi\n",para_mpi.get_myid()); + ComplexMatrix glo_pi_row(n_mu, N_all_mu); + mpi_comm_global_h.barrier(); + mpi_comm_global_h.allreduce_ComplexMatrix(loc_pi_row, glo_pi_row); + double Mu_after_pi_loc = omp_get_wtime(); + // cout<<" glo_pi_rowT nr,nc: "<(1.0, 0.0) - + glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); + } + else + { + loc_piT(i, j) = + -glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); + } + } + } + double Mu_after_loc_tran = omp_get_wtime(); + vq_time += (Mu_after_vq - Mu_begin); + pi_time += (Mu_after_pi_loc - Mu_after_vq); + loc_tran_time += (Mu_after_loc_tran - Mu_after_pi_loc); + } + // if(freq == chi0.tfg.get_freq_nodes()[0] && mpi_comm_global_h.is_root()) + // print_complex_matrix(" loc_piT",loc_piT); + double task_mid = omp_get_wtime(); + // printf("|process %d, before det\n",mpi_comm_global_h.myid); + complex ln_det = compute_pi_det_blacs_2d(loc_piT, arrdesc_pi, ipiv, info); + double task_end = omp_get_wtime(); + if (mpi_comm_global_h.is_root()) + lib_printf( + "| After det for freq: %f, q: ( %f, %f, %f) TIME_Vq_COMM: %f TIME_DET: " + "%f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n", + freq, q.x, q.y, q.z, vq_time, task_end - task_mid, pi_time, loc_tran_time); + // para_mpi.mpi_barrier(); + if (mpi_comm_global_h.myid == 0) + { + complex rpa_for_omega_q = trace_pi + ln_det; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } + + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // <, atom_mapping::pair_t_old>> + pi_freq_q_Mu_Nu; + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); + else + pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); + lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, + pi_freq_q_Mu_Nu.size()); + // mpi_comm_global_h.barrier(); + + int range_all = N_all_mu; + + vector part_range; + part_range.resize(atom_mu.size()); + part_range[0] = 0; + int count_range = 0; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif + for (int I = 0; I != atom_mu.size() - 1; I++) + { + count_range += atom_mu[I]; + part_range[I + 1] = count_range; + } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif + + // cout << "part_range:" << endl; + // for (int I = 0; I != atom_mu.size(); I++) + // { + // cout << part_range[I] << endl; + // } + // cout << "part_range over" << endl; + + // pi_freq_q contains all atoms + map, ComplexMatrix>> pi_freq_q; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("| process %d, qpts.size(): %zu,freq.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size(),chi0.tfg.get_freq_nodes().size()); + #endif + for(const auto &freq : chi0.tfg.get_freq_nodes()) + { + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; 
+ if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi = freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; + pi_freq_q[freq][q].create(range_all, range_all); + + ComplexMatrix pi_munu_tmp(range_all, range_all); + pi_munu_tmp.zero_out(); + if(!chi0.get_chi0_q().empty()) + for (const auto &Mu_Nupi : MuNupi) + { + const auto Mu = Mu_Nupi.first; + const auto Nupi = Mu_Nupi.second; + const size_t n_mu = atom_mu[Mu]; + for (const auto &Nu_pi : Nupi) + { + const auto Nu = Nu_pi.first; + const auto pimat = Nu_pi.second; + const size_t n_nu = atom_mu[Nu]; + + for (size_t mu = 0; mu != n_mu; ++mu) + { + for (size_t nu = 0; nu != n_nu; ++nu) + { + pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); + } + } + } + } + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + { + mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); + } + else + { + pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); + } + } + } + // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + // if (mpi_comm_global_h.myid == 0) + { + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + int num_iteration = 0; + #endif + for (const auto &freq_qpi : pi_freq_q) + { + const auto freq = freq_qpi.first; + const double freq_weight = chi0.tfg.find_freq_weight(freq); + for (const auto &q_pi : freq_qpi.second) + { + const auto q = q_pi.first; + const auto pimat = q_pi.second; + complex rpa_for_omega_q(0.0, 0.0); + ComplexMatrix 
identity(range_all, range_all); + ComplexMatrix identity_minus_pi(range_all, range_all); + identity.set_as_identity_matrix(); + identity_minus_pi = identity - pi_freq_q[freq][q]; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // if(num_iteration==0) + // if(mpi_comm_global_h.myid == 1) + // { + // complex* test_c= identity_minus_pi.c; + // for(int i=0;i det_for_rpa(1.0, 0.0); + int info_LU = 0; + int *ipiv = new int[range_all]; + LapackConnector::zgetrf(range_all, range_all, identity_minus_pi, range_all, ipiv, + &info_LU); + for (int ib = 0; ib != range_all; ib++) + { + if (ipiv[ib] != (ib + 1)) + det_for_rpa = -det_for_rpa * identity_minus_pi(ib, ib); + else + det_for_rpa = det_for_rpa * identity_minus_pi(ib, ib); + } + delete[] ipiv; + + complex trace_pi; + complex ln_det; + ln_det = std::log(det_for_rpa); + trace_pi = trace(pi_freq_q.at(freq).at(q)); + // cout << "PI trace vector:" << endl; + // cout << endl; + rpa_for_omega_q = ln_det + trace_pi; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + // lib_printf("Finish EcRPA %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + map, complex> global_cRPA_q; + for (auto q_weight : irk_weight) + { + MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, + MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); + } + + for (auto &q_crpa : global_cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + complex gather_tot_RPA_energy(0.0, 0.0); + MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, + mpi_comm_global_h.comm); + corr.value = gather_tot_RPA_energy; + } + corr.etype = CorrEnergy::type::RPA; + return corr; +} + +CorrEnergy compute_MP2_correlation(const Chi0 
&chi0, const atpair_k_cplx_mat_t &coulmat) +{ + CorrEnergy corr; + corr.etype = CorrEnergy::type::MP2; + return corr; +} + +map, atom_mapping::pair_t_old>> compute_Pi_q( + const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) +{ + map, atom_mapping::pair_t_old>> pi; + lib_printf("Begin compute_Pi_q , pid: %d\n", mpi_comm_global_h.myid); + for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) + { + const double freq = freq_qJQchi0.first; + for (auto &q_JQchi0 : freq_qJQchi0.second) + { + Vector3_Order q = q_JQchi0.first; + for (auto &JQchi0 : q_JQchi0.second) + { + const size_t J = JQchi0.first; + const size_t J_mu = atom_mu[J]; + for (auto &Qchi0 : JQchi0.second) + { + const size_t Q = Qchi0.first; + const size_t Q_mu = atom_mu[Q]; + // auto &chi0_mat = Qchi0.second; + for (int I = 0; I != natom; I++) + { + // const size_t I = I_p.first; + const size_t I_mu = atom_mu[I]; + pi[freq][q][I][Q].create(I_mu, Q_mu); + if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); + } + } + } + // if(freq==chi0.tfg.get_freq_nodes()[0]) + // for(auto &Ip:pi[freq][q]) + // for(auto &Jp:Ip.second) + // lib_printf(" |process %d, pi atpair: %d, %d + // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); + } + } + + // ofstream fp; + // std::stringstream ss; + // ss<<"out_pi_rank_"< ik_vec = k_pair.first; + auto chi0_freq_k = k_pair.second; + for (auto &J_p : chi0_freq_k) + { + const size_t J = J_p.first; + for (auto &Q_p : J_p.second) + { + const size_t Q = Q_p.first; + auto &chi0_mat = Q_p.second; + for (int I = 0; I != natom; I++) + { + // const size_t I = I_p.first; + // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, + // J, Q); + // cout<<" pi_IQ: "< trace_pi; + // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); + // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << + // ik_vec << " trace_pi: " << trace_pi << endl; + // print_complex_matrix_file(sm.str().c_str(), + // (*Vq.at(I).at(J).at(ik_vec)),fp,false); + // print_complex_matrix_file("chi0:", 
chi0_mat,fp,false); + // print_complex_matrix_file("pi_mat:", + // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); + // } + } + else + { + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", + // mpi_comm_global_h.myid, I, J, Q,2); + // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, 0) << + // endl; + pi.at(freq).at(ik_vec).at(I).at(Q) += + transpose(*Vq.at(J).at(I).at(ik_vec), 1) * chi0_mat; + } + + if (J != Q) + { + ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + if (I <= Q) + { + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d + // \n", mpi_comm_global_h.myid, I, J, Q,3); + // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; + pi.at(freq).at(ik_vec).at(I).at(J) += + (*Vq.at(I).at(Q).at(ik_vec)) * chi0_QJ; + } + else + { + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d + // \n", mpi_comm_global_h.myid, I, J, Q,4); + // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, + // 0) << endl; + pi.at(freq).at(ik_vec).at(I).at(J) += + transpose(*Vq.at(Q).at(I).at(ik_vec), 1) * chi0_QJ; + } + } + } + } + } + } + } + // fp.close(); + // print_complex_matrix(" + // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); + /* print_complex_matrix(" + * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ + return pi; +} + +map, atom_mapping::pair_t_old>> compute_Pi_q_MPI( + const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) +{ + map, atom_mapping::pair_t_old>> pi; + lib_printf("Begin compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); + for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) + { + const double freq = freq_qJQchi0.first; + for (auto &q_JQchi0 : freq_qJQchi0.second) + { + Vector3_Order q = q_JQchi0.first; + for (auto &JQchi0 : q_JQchi0.second) + { + const size_t J = JQchi0.first; + const size_t J_mu = atom_mu[J]; 
+ for (auto &Qchi0 : JQchi0.second) + { + const size_t Q = Qchi0.first; + const size_t Q_mu = atom_mu[Q]; + // auto &chi0_mat = Qchi0.second; + for (int I = 0; I != natom; I++) + { + // const size_t I = I_p.first; + const size_t I_mu = atom_mu[I]; + pi[freq][q][I][Q].create(I_mu, Q_mu); + if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); + } + } + } + // if(freq==chi0.tfg.get_freq_nodes()[0]) + // for(auto &Ip:pi[freq][q]) + // for(auto &Jp:Ip.second) + // lib_printf(" |process %d, pi atpair: %d, %d + // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); + } + } + + // ofstream fp; + // std::stringstream ss; + // ss<<"out_pi_rank_"< ik_vec = k_pair.first; + for (int I = 0; I != natom; I++) + { + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif + atom_mapping::pair_t_old Vq_row = gather_vq_row_q(I, coulmat, ik_vec); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif + for (auto &freq_p : chi0.get_chi0_q()) + { + const double freq = freq_p.first; + const auto chi0_freq = freq_p.second; + + auto chi0_freq_k = freq_p.second.at(ik_vec); + + for (auto &J_p : chi0_freq_k) + { + const size_t J = J_p.first; + for (auto &Q_p : J_p.second) + { + const size_t Q = Q_p.first; + auto &chi0_mat = Q_p.second; + + // const size_t I = I_p.first; + // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, + // J, Q); + // cout<<" pi_IQ: "< trace_pi; + // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); + // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << ik_vec + // << " trace_pi: " << trace_pi << endl; + // print_complex_matrix_file(sm.str().c_str(), + // Vq_row.at(I).at(J),fp,false); print_complex_matrix_file("chi0:", + // chi0_mat,fp,false); print_complex_matrix_file("pi_mat:", + // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); + // } + + if (J != Q) + { + ComplexMatrix 
chi0_QJ = transpose(chi0_mat, 1); + // if (freq == chi0.tfg.get_freq_nodes()[0]) + // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", + // mpi_comm_global_h.myid, I, J,Q,3); + // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; + pi.at(freq).at(ik_vec).at(I).at(J) += Vq_row.at(I).at(Q) * chi0_QJ; + } + } + } + } + } + } + // fp.close(); + // print_complex_matrix(" + // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); + /* print_complex_matrix(" + * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ + lib_printf("End compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); + return pi; +} + +ComplexMatrix compute_Pi_freq_q_row(const Vector3_Order &ik_vec, + const atom_mapping::pair_t_old &chi0_freq_q, + const atom_mapping::pair_t_old &Vq_row, + const int &I) +{ + map pi; + // lib_printf("Begin cal_pi_k , pid: %d\n", para_mpi.get_myid()); + auto I_mu = atom_mu[I]; + for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); + + omp_lock_t pi_lock; + omp_init_lock(&pi_lock); +#pragma omp parallel for schedule(dynamic) + for (int iap = 0; iap != local_atpair.size(); iap++) + { + const size_t J = local_atpair[iap].first; + const size_t Q = local_atpair[iap].second; + auto &chi0_mat = chi0_freq_q.at(J).at(Q); + auto tmp_pi_mat = Vq_row.at(I).at(J) * chi0_mat; + ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + auto tmp_pi_mat2 = Vq_row.at(I).at(Q) * chi0_QJ; + omp_set_lock(&pi_lock); + pi.at(Q) += tmp_pi_mat; + if (J != Q) + { + pi.at(J) += tmp_pi_mat2; + } + omp_unset_lock(&pi_lock); + } + omp_destroy_lock(&pi_lock); + // for (auto &J_p : chi0_freq_q) + // { + // const size_t J = J_p.first; + // for (auto &Q_p : J_p.second) + // { + // const size_t Q = Q_p.first; + // auto &chi0_mat = Q_p.second; + // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; + // if (J != Q) + // { + // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; + // } + // } + 
// } + // Pi_rowT + // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); + // complex *pi_row_ptr=pi_row.c; + // for(auto &Jp:pi) + // { + // auto J=Jp.first; + // auto J_mu=atom_mu[J]; + // const auto length=sizeof(complex)* I_mu *J_mu; + // memcpy(pi_row_ptr, pi.at(J).c,length); + // pi_row_ptr+=I_mu *J_mu; + // } + ComplexMatrix pi_row(atom_mu[I], N_all_mu); + for (int i = 0; i != pi_row.nr; i++) + for (int J = 0; J != natom; J++) + for (int j = 0; j != atom_mu[J]; j++) + pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); + return pi_row; +} + +ComplexMatrix compute_Pi_freq_q_row_ri(const Vector3_Order &ik_vec, + const atom_mapping::pair_t_old &chi0_freq_q, + const atpair_k_cplx_mat_t &Vq_loc, const int &I, + const Vector3_Order &q) +{ + map pi; + // lib_printf("Begin cal_pi_k , pid: %d\n", mpi_comm_global_h.myid); + auto I_mu = atom_mu[I]; + for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); + + omp_lock_t pi_lock; + omp_init_lock(&pi_lock); +#pragma omp parallel for schedule(dynamic) + for (int iap = 0; iap != local_atpair.size(); iap++) + { + const size_t J = local_atpair[iap].first; + const size_t Q = local_atpair[iap].second; + auto &chi0_mat = chi0_freq_q.at(J).at(Q); + // printf("| IN cal Pi process %d, I: %d J: %d Q: %d\n",mpi_comm_global_h.myid, I,J,Q ); + auto tmp_pi_mat = *Vq_loc.at(I).at(J).at(q) * chi0_mat; + ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + auto tmp_pi_mat2 = *Vq_loc.at(I).at(Q).at(q) * chi0_QJ; + omp_set_lock(&pi_lock); + pi.at(Q) += tmp_pi_mat; + if (J != Q) + { + pi.at(J) += tmp_pi_mat2; + } + omp_unset_lock(&pi_lock); + } + omp_destroy_lock(&pi_lock); + // for (auto &J_p : chi0_freq_q) + // { + // const size_t J = J_p.first; + // for (auto &Q_p : J_p.second) + // { + // const size_t Q = Q_p.first; + // auto &chi0_mat = Q_p.second; + // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; + // if (J != Q) + // { + // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); + // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; + // } + // } + 
// } + // Pi_rowT + // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); + // complex *pi_row_ptr=pi_row.c; + // for(auto &Jp:pi) + // { + // auto J=Jp.first; + // auto J_mu=atom_mu[J]; + // const auto length=sizeof(complex)* I_mu *J_mu; + // memcpy(pi_row_ptr, pi.at(J).c,length); + // pi_row_ptr+=I_mu *J_mu; + // } + ComplexMatrix pi_row(atom_mu[I], N_all_mu); + for (int i = 0; i != pi_row.nr; i++) + for (int J = 0; J != natom; J++) + for (int j = 0; j != atom_mu[J]; j++) + pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); + return pi_row; +} + +atom_mapping::pair_t_old gather_vq_row_q(const int &I, + const atpair_k_cplx_mat_t &coulmat, + const Vector3_Order &ik_vec) +{ + auto I_mu = atom_mu[I]; + atom_mapping::pair_t_old Vq_row; + for (int J_tmp = 0; J_tmp != natom; J_tmp++) + { + auto J_mu = atom_mu[J_tmp]; + ComplexMatrix loc_vq(atom_mu[I], atom_mu[J_tmp]); + Vq_row[I][J_tmp].create(atom_mu[I], atom_mu[J_tmp]); + // const auto length=sizeof(complex)* I_mu *J_mu; + // complex *loc_vq_ptr=loc_vq.c; + if (I <= J_tmp) + { + if (Vq.count(I)) + if (Vq.at(I).count(J_tmp)) loc_vq = *Vq.at(I).at(J_tmp).at(ik_vec); + } + else + { + if (Vq.count(J_tmp)) + if (Vq.at(J_tmp).count(I)) loc_vq = transpose(*Vq.at(J_tmp).at(I).at(ik_vec), 1); + } + mpi_comm_global_h.allreduce_ComplexMatrix(loc_vq, Vq_row[I][J_tmp]); + } + return Vq_row; +} + +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + const int range_all = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using LAPACK" << endl; + } + + mpi_comm_global_h.barrier(); + // use q-points as the outmost loop, so that square root of Coulomb will not be recalculated at + // each frequency point + vector> qpts; + for (const auto &qMuNuchi : 
chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + qpts.push_back(qMuNuchi.first); + + for (const auto &q : qpts) + { + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + char fn[80]; + + ComplexMatrix Vq_all(range_all, range_all); + for (const auto &Mu_NuqVq : coulmat_eps) + { + auto Mu = Mu_NuqVq.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_qVq : Mu_NuqVq.second) + { + auto Nu = Nu_qVq.first; + if (0 == Nu_qVq.second.count(q)) continue; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + { + Vq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = + (*Nu_qVq.second.at(q))(i_mu, i_nu); + Vq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = + conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); + } + } + } + if (Params::debug) + { + sprintf(fn, "Vq_all_q_%d.mtx", iq); + print_complex_matrix_mm(Vq_all, Params::output_dir + "/" + fn, 1e-15); + } + auto sqrtVq_all = power_hemat(Vq_all, 0.5, true, false, Params::sqrt_coulomb_threshold); + // Vq_all is now eigenvectors of the original Coulomb matrix + const auto &Vq_eigen = Vq_all; + if (Params::debug) + { + sprintf(fn, "sqrtVq_all_q_%d.mtx", iq); + print_complex_matrix_mm(sqrtVq_all, Params::output_dir + "/" + fn, 1e-15); + // sprintf(fn, "rotated_sqrtVq_all_q_%d.mtx", iq); + // print_complex_matrix_mm(Vq_all * sqrtVq_all * transpose(Vq_all, true), fn, 1e-15); + // print_complex_matrix_mm(transpose(Vq_all, true) * sqrtVq_all * Vq_all, fn, 1e-15); + sprintf(fn, "Vqeigenvec_q_%d.mtx", iq); + print_complex_matrix_mm(Vq_eigen, Params::output_dir + "/" + fn, 1e-15); + } + + // truncated (cutoff) Coulomb + ComplexMatrix Vqcut_all(range_all, range_all); + for (auto &Mu_NuqVq : coulmat_wc) + { + auto Mu = Mu_NuqVq.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_qVq : Mu_NuqVq.second) + { + auto Nu = Nu_qVq.first; + if (0 == Nu_qVq.second.count(q)) continue; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for 
(int i_nu = 0; i_nu != n_nu; i_nu++) + { + Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = + (*Nu_qVq.second.at(q))(i_mu, i_nu); + Vqcut_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = + conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); + } + } + } + auto sqrtVqcut_all = + power_hemat(Vqcut_all, 0.5, false, true, Params::sqrt_coulomb_threshold); + // sprintf(fn, "sqrtVqcut_all_q_%d.mtx", iq); + // print_complex_matrix_mm(sqrtVqcut_all, fn, 1e-15); + sprintf(fn, "Vqcut_all_filtered_q_%d.mtx", iq); + // print_complex_matrix_mm(Vqcut_all, fn, 1e-15); + // save the filtered truncated Coulomb back to the atom mapping object + // TODO: revise the necessity + for (auto &Mu_NuqVq : coulmat_wc) + { + auto Mu = Mu_NuqVq.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_qVq : Mu_NuqVq.second) + { + auto Nu = Nu_qVq.first; + if (0 == Nu_qVq.second.count(q)) continue; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + (*Nu_qVq.second.at(q))(i_mu, i_nu) = + Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu); + } + } + + ComplexMatrix chi0fq_all(range_all, range_all); + for (const auto &freq_qMuNuchi : chi0.get_chi0_q()) + { + auto freq = freq_qMuNuchi.first; + auto ifreq = chi0.tfg.get_freq_index(freq); + auto MuNuchi = freq_qMuNuchi.second.at(q); + for (const auto &Mu_Nuchi : MuNuchi) + { + auto Mu = Mu_Nuchi.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_chi : Mu_Nuchi.second) + { + auto Nu = Nu_chi.first; + auto n_nu = atom_mu[Nu]; + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + { + chi0fq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = + Nu_chi.second(i_mu, i_nu); + chi0fq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = + conj(Nu_chi.second(i_mu, i_nu)); + } + } + } + sprintf(fn, "chi0fq_all_q_%d_freq_%d.mtx", iq, ifreq); + print_complex_matrix_mm(chi0fq_all, Params::output_dir + "/" + fn, 1e-15); + + ComplexMatrix identity(range_all, range_all); + 
identity.set_as_identity_matrix(); + auto eps_fq = sqrtVq_all * chi0fq_all * sqrtVq_all; + eps_fq = transpose(Vq_eigen, true) * eps_fq * Vq_eigen; + if (!epsmac_LF_imagfreq.empty() && is_gamma_point(q)) + { + // rotate to Coulomb-diagonal basis + // lib_printf("Largest off-diagonal = %f\n", eps_fq.get_max_abs_offdiag()); + // print_matrix("rotated eps_fq: ", eps_fq.real()); + // replacing the element corresponding to largest Coulomb eigenvalue with dielectric + // function + lib_printf("%22.12f %22.12f %22.12f %22.12f\n", freq, eps_fq(0, 0).real(), + eps_fq(eps_fq.nr - 1, eps_fq.nc - 1).real(), + epsmac_LF_imagfreq[ifreq].real()); + // eps_fq(eps_fq.nr - 1, eps_fq.nc - 1) = epsmac_LF_imagfreq[ifreq]; + eps_fq(0, 0) = 1.0 - epsmac_LF_imagfreq[ifreq]; + } + if (Params::debug) + { + sprintf(fn, "rotated_vsxvs_q_%d_freq_%d.mtx", iq, ifreq); + print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); + } + // rotate back to ABF + eps_fq = Vq_eigen * eps_fq * transpose(Vq_eigen, true); + eps_fq = identity - eps_fq; + if (Params::debug) + { + sprintf(fn, "eps_q_%d_freq_%d.mtx", iq, ifreq); + print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); + } + + // invert the epsilon matrix + power_hemat_onsite(eps_fq, -1); + auto wc_all = sqrtVqcut_all * (eps_fq - identity) * sqrtVqcut_all; + // sprintf(fn, "inveps_q_%d_freq_%d.mtx", iq, ifreq); + // print_complex_matrix_mm(eps_fq, fn, 1e-15); + // sprintf(fn, "wc_q_%d_freq_%d.mtx", iq, ifreq); + // print_complex_matrix_mm(wc_all, fn, 1e-15); + + // save result to the atom mapping object + for (auto &Mu_Nuchi : MuNuchi) + { + auto Mu = Mu_Nuchi.first; + auto n_mu = atom_mu[Mu]; + for (auto &Nu_chi : Mu_Nuchi.second) + { + auto Nu = Nu_chi.first; + auto n_nu = atom_mu[Nu]; + shared_ptr wc_ptr = make_shared(); + wc_ptr->create(n_mu, n_nu); + for (int i_mu = 0; i_mu != n_mu; i_mu++) + for (int i_nu = 0; i_nu != n_nu; i_nu++) + { + (*wc_ptr)(i_mu, i_nu) = + wc_all(part_range[Mu] + i_mu, part_range[Nu] + 
i_nu); + } + Wc_freq_q[freq][Mu][Nu][q] = + matrix_m>(n_mu, n_nu, wc_ptr->c, MAJOR::ROW, MAJOR::ROW); + } + } + } + } + + return Wc_freq_q; +} + +// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using ScaLAPACK" << endl; + } + mpi_comm_global_h.barrier(); + + Profiler::start("compute_Wc_freq_q_blacs_init"); + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // Use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter Maximal blocksize ensure that atom indices related to the rows/columns of a + // local matrix is minimized. + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + // This, however, is not optimal for matrix operations, and may lead to segment fault during + // MPI operations with parallel linear algebra subroutine. 
Thus we define an optimal blocksize + Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); + const int nb_opt = min(128, desc_nabf_nabf.nb()); + desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); + // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric + // matrix + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + // temp_block is used to collect data from IJ-pair data structure with comm_map2_first + auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + // Below are the working arrays for matrix operations + auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + + const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + + coul_chi0_block.size() + coulwc_block.size()) * + 16.0e-6; + ofs_myid << get_timestamp() + << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks + << endl; + + const auto atpair_local = dispatch_upper_trangular_tasks( + natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, + blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); +#ifdef LIBRPA_DEBUG + ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; + ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; +#endif + + // IJ pair of Wc to be returned + pair, set> Iset_Jset_Wc; + for (const auto &ap : atpair_local) + { + Iset_Jset_Wc.first.insert(ap.first); + Iset_Jset_Wc.second.insert(ap.second); + } + + // Prepare local basis indices for 2D->IJ map + int I, 
iI; + map> map_lor_v; + map> map_loc_v; + for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_lor_v[I].push_back(iI); + } + for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_loc_v[I].push_back(iI); + } + + vector> qpts; + for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); + + vec eigenvalues(n_abf); + Profiler::cease("compute_Wc_freq_q_blacs_init"); + LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); + + Profiler::start("compute_Wc_freq_q_work"); +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); + const int iq_in_k = + std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); + // q-point in fractional coordinates + const auto &qf = kfrac_list[iq_in_k]; + LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, + qpts.size(), qf.x, qf.y, qf.z); + coul_block.zero_out(); + coulwc_block.zero_out(); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + // q-array for LibRI object + std::array qa = {q.x, q.y, q.z}; + + // collect the block elements of truncated coulomb matrices first + // as we reuse coul_eigen_block to reduce memory usage + Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); + { + size_t n_singular_coulwc; + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = 
Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || + coulmat_wc.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // collect_block_from_IJ_storage_syhe( + // coulwc_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // } + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); + Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); + power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt"); + 
LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); + ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; + + Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); + // collect the block elements of coulomb matrices + { + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || + coulmat_eps.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; + // ofs_myid << "Couleps_libri" << endl << couleps_libri; + // if (couleps_libri.size() == 0) + // throw std::logic_error("data at q-point not found in coulmat_eps"); + + // perform communication + ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; +#ifdef LIBRPA_DEBUG + ofs_myid << set_IJ_nabf_nabf << endl; + ofs_myid << "Extended blocks" << endl; + ofs_myid << "atom 1: " << s0_s1.first << endl; + ofs_myid << "atom 2: " << s0_s1.second << endl; +#endif + // ofs_myid << "Owned blocks\n"; + // print_keys(ofs_myid, couleps_libri); + // mpi_comm_global_h.barrier(); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + 
mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; + + ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + ofs_myid << get_timestamp() << " Done construct couleps 2D block" << endl; + } + // char fn[100]; + // sprintf(fn, "couleps_iq_%d.mtx", iq); + // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); + // ofs_myid << str(coul_block); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + size_t n_singular; + ofs_myid << get_timestamp() << " Start power hemat couleps\n"; + matrix_m> sqrtveig_blacs; + if (is_gamma_point(q)) + { + // choice of power_hemat_blacs_real/power_hemat_blacs_desc + // leads to sub-meV difference + sqrtveig_blacs = power_hemat_blacs_real( + coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, + eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); + if (Params::replace_w_head && Params::option_dielect_func == 3) + { + df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); + } + } + else + { + sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + } + ofs_myid << get_timestamp() << " Done power hemat couleps\n"; + // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); + // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at + // imaginary frequency is not prepared + if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); + const size_t n_nonsingular = n_abf - n_singular; + 
Profiler::stop("epsilon_prepare_couleps_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); + ofs_myid << get_timestamp() << " Done couleps sqrt\n"; + std::flush(ofs_myid); + + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + Profiler::start("epsilon_wc_work_q_omega"); + Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); + chi0_block.zero_out(); + { + std::map>, + RI::Tensor>>> + chi0_libri; + if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) + { + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = + RI::Tensor>({n_mu, n_nu}, pchi); + } + } + // Release the chi0 block for this frequency and q to reduce memory load, + // as they will not be used again + chi0.free_chi0_q(freq, q); + } + // ofs_myid << "chi0_libri" << endl << chi0_libri; + Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); + const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // collect_block_from_IJ_storage_syhe( + // chi0_block, desc_nabf_nabf, 
LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_chi0.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // } + Profiler::start("epsilon_prepare_chi0_2d_collect_block"); + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); + // sprintf(fn, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); + // print_matrix_mm_file_parallel(fn, chi0_block, desc_nabf_nabf); + } + Profiler::stop("epsilon_prepare_chi0_2d"); + + Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); + + // for Gamma point, overwrite the head term + if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) + { + ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; + // rotate to Coulomb-eigenvector basis + // descending order + ScalapackConnector::pgemm_f( + 'N', 'N', n_abf, n_nonsingular, n_abf, 1.0, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + 0.0, coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemm_f('C', 'N', n_nonsingular, n_nonsingular, n_abf, 1.0, + sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + + if (Params::option_dielect_func == 3) + { + chi0_block *= -1.0; + for (int i = 0; i != n_nonsingular; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) += 1.0; + } + ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" + << endl; + df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); + 
} + else + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); + if (ilo >= 0 && jlo >= 0) + { + ofs_myid << get_timestamp() << "Perform the head element overwrite" << endl; + chi0_block(ilo, jlo) = 1.0 - epsmac_LF_imagfreq[ifreq]; + } + } + // rotate back to ABF + // descending order + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_nonsingular, n_nonsingular, 1.0, + coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemm_f('N', 'C', n_abf, n_abf, n_nonsingular, 1.0, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + if (Params::option_dielect_func != 3) + { + // now chi0_block is actually v1/2 chi v1/2 + chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) += 1.0; + } + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); + invert_scalapack(chi0_block, desc_nabf_nabf_opt); + } + // subtract 1 from diagonal + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) -= 1.0; + } + } + else + { + Profiler::start("epsilon_compute_eps_pgemm_1"); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, + 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, + 1, desc_nabf_nabf_opt.desc); + Profiler::cease("epsilon_compute_eps_pgemm_1"); + 
Profiler::start("epsilon_compute_eps_pgemm_2"); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, + coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, + chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); + Profiler::cease("epsilon_compute_eps_pgemm_2"); + // now chi0_block is actually v1/2 chi v1/2 + chi0_block *= -1.0; + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) += 1.0; + } + Profiler::stop("epsilon_compute_eps"); + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); + invert_scalapack(chi0_block, desc_nabf_nabf_opt); + // subtract 1 from diagonal + for (int i = 0; i != n_abf; i++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); + if (jlo < 0) continue; + chi0_block(ilo, jlo) -= 1.0; + } + Profiler::stop("epsilon_invert_eps"); + } + // debug for GaAs + // for (int i = 0; i != n_abf; i++) + // { + // for (int j = 0; j != n_abf; j++) + // { + // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + // if (ilo < 0) continue; + // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + // if (jlo < 0) continue; + // if(i==j) + // chi0_block(ilo, jlo) = 1.0; + // else + // chi0_block(ilo, jlo) = 0.0; + // } + // } + // debug for unfold shrink Wc + // for (int i = 0; i != n_abf; i++) + //{ + // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + // if (ilo < 0) continue; + // for (int j = 0; j != n_abf; j++) + // { + // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + // if (jlo < 0) continue; + // if (i == j) + // chi0_block(ilo, jlo) = 1.0; + // else + // chi0_block(ilo, jlo) = 0.0; + // } + // } + // debug end + + 
Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coulwc_block.ptr(), 1, + 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_chi0_block.ptr(), + 1, 1, desc_nabf_nabf_opt.desc, coulwc_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, 0.0, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_multiply_coulwc"); + // lib_printf("chi0_block\n%s", str(chi0_block).c_str()); + // now chi0_block is the screened Coulomb interaction Wc (i.e. W-V) + + Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); + Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); + map>>> Wc_MNmap; + // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, + // LIBRPA::atomic_basis_abf, chi0_block, + // desc_nabf_nabf, MAJOR::ROW); + map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, + temp_block, desc_nabf_nabf, MAJOR::ROW); + Profiler::stop("epsilon_convert_wc_map_block"); + + Profiler::start("epsilon_convert_wc_communicate", "Communicate"); + { + std::map>, + RI::Tensor>>> + Wc_libri; + Profiler::start("epsilon_convert_wc_communicate_1"); + for (const auto &M_NWc : Wc_MNmap) + { + const auto &M = M_NWc.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_Wc : M_NWc.second) + { + const auto &N = N_Wc.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &Wc = N_Wc.second; + // std::valarray> Wc_va(Wc.ptr(), Wc.size()); + // auto pWc = std::make_shared>>(); + // *pWc = Wc_va; + /*if (iq == 10 && ifreq == 10) + { + 
char fn[100]; + sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); + print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); + }*/ + Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); + } + } + Profiler::stop("epsilon_convert_wc_communicate_1"); + Profiler::start("epsilon_convert_wc_communicate_2"); + // main timing + // cout << Wc_libri; + const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); + Profiler::stop("epsilon_convert_wc_communicate_2"); + Profiler::start("epsilon_convert_wc_communicate_3"); + // parse collected to + for (const auto &MN : atpair_local) + { + const auto &M = MN.first; + const auto &N = MN.second; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + // Use row major for later usage in LibRI + Wc_freq_q[freq][M][N][q] = matrix_m>( + n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); + } + Profiler::stop("epsilon_convert_wc_communicate_3"); + // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) + // for ( int i_nu = 0; i_nu != n_nu; i_nu++ ) + // { + // } + } + Profiler::stop("epsilon_convert_wc_communicate"); + Profiler::stop("epsilon_convert_wc_2d_to_ij"); + Profiler::cease("epsilon_wc_work_q_omega"); + LIBRPA::utils::lib_printf_root( + "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, + Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), + Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + Profiler::cease("compute_Wc_freq_q_work"); + LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_work"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); + + return Wc_freq_q; +} + +map, matrix_m>>>::pair_t_old> +FT_Wc_freq_q( + const map, matrix_m>>>::pair_t_old> + 
&Wc_freq_q, + const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) +{ + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + map, matrix_m>>>::pair_t_old> + Wc_freq_R; + const int ngrids = tfg.get_n_grids(); + if (Params::debug) + { + if (mpi_comm_global_h.is_root()) lib_printf("Converting Wc q,w -> R,t\n"); + mpi_comm_global_h.barrier(); + } + set> atpairs_unique; + for (const auto &freq_MuNuqWc : Wc_freq_q) + { + for (const auto &Mu_NuqWc : freq_MuNuqWc.second) + { + const auto Mu = Mu_NuqWc.first; + for (const auto &Nu_qWc : Mu_NuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + } + + vector>, pair>> ifreqR_atpair_all; + // allocate space before hand + for (auto R : Rlist) + { + for (int ifreq = 0; ifreq != ngrids; ifreq++) + { + auto freq = tfg.get_freq_nodes()[ifreq]; + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu[Nu]; + Wc_freq_R[freq][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); + ifreqR_atpair_all.push_back({{ifreq, R}, atpair_unique}); + } + } + } + +#pragma omp parallel for schedule(dynamic) + for (auto ifreqR_atpair : ifreqR_atpair_all) + { + const auto ifreq = ifreqR_atpair.first.first; + const auto freq = tfg.get_freq_nodes()[ifreq]; + const auto R = ifreqR_atpair.first.second; + const auto Mu = ifreqR_atpair.second.first; + const auto Nu = ifreqR_atpair.second.second; + const int n_mu = atom_mu[Mu]; + const int n_nu = atom_mu[Nu]; + + // thread local temporary matrix + matrix_m> WfR_temp(n_mu, n_nu, major_Wc); + + if (Wc_freq_q.count(freq) == 0) continue; + if (Wc_freq_q.at(freq).count(Mu) == 0) continue; + if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; + + for (auto &Wc_q : Wc_freq_q.at(freq).at(Mu).at(Nu)) + { + const auto 
q = Wc_q.first; + const auto &Wc = Wc_q.second; + for (auto q_bz : map_irk_ks[q]) + { + const double ang = -q_bz * (R * latvec) * TWO_PI; + const complex weight = + complex(cos(ang), sin(ang)) / double(n_k_points); + if (q == q_bz) + WfR_temp += Wc * weight; + else + WfR_temp += conj(Wc) * weight; + } + } + // omp_set_lock(&lock_Wc); + Wc_freq_R[freq][Mu][Nu][R] += WfR_temp; + // omp_unset_lock(&lock_Wc); + } + + if (mpi_comm_global_h.is_root()) + { + lib_printf("Done converting Wc(q,w) -> Wc(R,w)\n"); + } + mpi_comm_global_h.barrier(); + + return Wc_freq_R; +} + +map, matrix_m>>>::pair_t_old> +CT_FT_Wc_freq_q( + const map, matrix_m>>>::pair_t_old> + &Wc_freq_q, + const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) +{ + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + map, matrix_m>>>::pair_t_old> + Wc_tau_R; + if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); + const int ngrids = tfg.get_n_grids(); + + LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(R,t)\n"); + mpi_comm_global_h.barrier(); + + set> atpairs_unique; + for (const auto &freq_MuNuqWc : Wc_freq_q) + { + for (const auto &Mu_NuqWc : freq_MuNuqWc.second) + { + const auto Mu = Mu_NuqWc.first; + for (const auto &Nu_qWc : Mu_NuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + } + + vector>, pair>> itauR_atpair_all; + // allocate space before hand + for (auto R : Rlist) + { + for (int itau = 0; itau != ngrids; itau++) + { + auto tau = tfg.get_time_nodes()[itau]; + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu[Nu]; + Wc_tau_R[tau][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); + itauR_atpair_all.push_back({{itau, R}, atpair_unique}); + } + } + } 
+ + LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", + LIBRPA::envs::myid_global, itauR_atpair_all.size(), + omp_get_max_threads()); + +#pragma omp parallel for schedule(dynamic) + for (auto itauR_atpair : itauR_atpair_all) + { + const auto itau = itauR_atpair.first.first; + const auto tau = tfg.get_time_nodes()[itau]; + const auto R = itauR_atpair.first.second; + const auto Mu = itauR_atpair.second.first; + const auto Nu = itauR_atpair.second.second; + const int n_mu = atom_mu[Mu]; + const int n_nu = atom_mu[Nu]; + + // thread local temporary matrix + matrix_m> WtR_temp(n_mu, n_nu, major_Wc); + + for (int ifreq = 0; ifreq < ngrids; ifreq++) + { + const auto freq = tfg.get_freq_nodes()[ifreq]; + const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); + // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << + // "\n"; + if (Wc_freq_q.count(freq) == 0) continue; + if (Wc_freq_q.at(freq).count(Mu) == 0) continue; + if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; + // cout << "freq: " << freq << "\n"; + + const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); + for (auto &Wc_q : Wc_q_all) + { + const auto q = Wc_q.first; + const auto &Wc = Wc_q.second; + for (auto q_bz : map_irk_ks[q]) + { + const double ang = -q_bz * (R * latvec) * TWO_PI; + const complex weight = + complex(cos(ang), sin(ang)) * f2t / double(n_k_points); + // ofs_myid << q << " " << q_bz << " weight = " << weight << "\n"; + // ofs_myid << q_Wc.second; + if (q == q_bz) + WtR_temp += Wc * weight; + else + WtR_temp += conj(Wc) * weight; + } + } + } + // omp_set_lock(&lock_Wc); + Wc_tau_R[tau][Mu][Nu][R] += WtR_temp; + // omp_unset_lock(&lock_Wc); + } + + LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> R,t\n"); + mpi_comm_global_h.barrier(); + + // myz debug: check the imaginary part of the matrix + // NOTE: if G(R) is real, is W(R) real as well? 
+ // if (Params::debug) + // { + // for (const auto & tau_MuNuRWc: Wc_tau_R) + // { + // char fn[80]; + // auto tau = tau_MuNuRWc.first; + // auto itau = tfg.get_time_index(tau); + // for (const auto & Mu_NuRWc: tau_MuNuRWc.second) + // { + // auto Mu = Mu_NuRWc.first; + // // const int n_mu = atom_mu[Mu]; + // for (const auto & Nu_RWc: Mu_NuRWc.second) + // { + // auto Nu = Nu_RWc.first; + // // const int n_nu = atom_mu[Nu]; + // for (const auto & R_Wc: Nu_RWc.second) + // { + // auto R = R_Wc.first; + // auto Wc = R_Wc.second; + // auto iteR = std::find(Rlist.cbegin(), Rlist.cend(), R); + // auto iR = std::distance(Rlist.cbegin(), iteR); + // sprintf(fn, "Wc_Mu_%zu_Nu_%zu_iR_%zu_itau_%d_id_%d.mtx", Mu, Nu, iR, + // itau, mpi_comm_global_h.myid); print_matrix_mm_file(Wc, + // Params::output_dir + "/" + fn, 1e-10); + // } + // } + // } + // } + // } + // end myz debug + return Wc_tau_R; +} + +map, matrix_m>>>::pair_t_old> +CT_FT_Wc_freq2time_q( + const map, matrix_m>>>::pair_t_old> + &Wc_freq_q, + const TFGrids &tfg, const int &n_k_points, const vector> &Rlist, + const vector> &qlist) +{ + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + map, matrix_m>>>::pair_t_old> + Wc_tau_q; + if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); + const int ngrids = tfg.get_n_grids(); + + LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(q,t)\n"); + mpi_comm_global_h.barrier(); + + set> atpairs_unique; + for (const auto &freq_MuNuqWc : Wc_freq_q) + { + for (const auto &Mu_NuqWc : freq_MuNuqWc.second) + { + const auto Mu = Mu_NuqWc.first; + for (const auto &Nu_qWc : Mu_NuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + } + vector>> itau_atpair_all; + // allocate space before hand + + for (int itau = 0; itau != ngrids; itau++) + { + auto tau = 
tfg.get_time_nodes()[itau]; + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu_s[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu_s[Nu]; + for (auto q : qlist) + Wc_tau_q[tau][Mu][Nu][q] = matrix_m>(n_mu, n_nu, major_Wc); + itau_atpair_all.push_back({itau, atpair_unique}); + } + } + + LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", + LIBRPA::envs::myid_global, itau_atpair_all.size(), + omp_get_max_threads()); + +#pragma omp parallel for schedule(dynamic) + for (auto itau_atpair : itau_atpair_all) + { + const auto itau = itau_atpair.first; + const auto tau = tfg.get_time_nodes()[itau]; + const auto Mu = itau_atpair.second.first; + const auto Nu = itau_atpair.second.second; + const int n_mu = atom_mu_s[Mu]; + const int n_nu = atom_mu_s[Nu]; + + // thread local temporary matrix + matrix_m> Wtq_temp(n_mu, n_nu, major_Wc); + + for (int ifreq = 0; ifreq < ngrids; ifreq++) + { + const auto freq = tfg.get_freq_nodes()[ifreq]; + const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); + // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << + // "\n"; + if (Wc_freq_q.count(freq) == 0) continue; + if (Wc_freq_q.at(freq).count(Mu) == 0) continue; + if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; + // cout << "freq: " << freq << "\n"; + + const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); + for (auto &Wc_q : Wc_q_all) + { + const auto q = Wc_q.first; + const auto &Wc = Wc_q.second; + const double weight = f2t; + Wtq_temp = Wc * weight; + // omp_set_lock(&lock_Wc); + Wc_tau_q[tau][Mu][Nu][q] += Wtq_temp; + // omp_unset_lock(&lock_Wc); + } + } + } + + LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> q,t\n"); + mpi_comm_global_h.barrier(); + + return Wc_tau_q; +} + +atom_mapping, matrix_m>>>::pair_t_old CT_FT_Wc_tau_R2q( + const atom_mapping, matrix_m>>>::pair_t_old + &Wc_tau_q, + const TFGrids &tfg, 
const int &n_kpoints, const vector> &Rlist, + const int &itau) +{ + const int tau = tfg.get_time_nodes()[itau]; + // major of Wc_freq_q input and Wc_tau_R output + const MAJOR major_Wc = MAJOR::ROW; + + atom_mapping, matrix_m>>>::pair_t_old Wc_tau_R; + if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); + const int ngrids = tfg.get_n_grids(); + + LIBRPA::utils::lib_printf_root("Converting Wc(q,t) -> W(R,t)\n"); + mpi_comm_global_h.barrier(); + + set> atpairs_unique; + for (const auto &MuNuqWc : Wc_tau_q) + { + const auto Mu = MuNuqWc.first; + for (const auto &Nu_qWc : MuNuqWc.second) + { + const auto Nu = Nu_qWc.first; + atpairs_unique.insert({Mu, Nu}); + for (const auto &q_Wc : Nu_qWc.second) + { + assert(q_Wc.second.major() == major_Wc); + } + } + } + + vector, pair>> iR_atpair_all; + // allocate space before hand + for (auto R : Rlist) + { + for (auto atpair_unique : atpairs_unique) + { + const auto Mu = atpair_unique.first; + const int n_mu = atom_mu_l[Mu]; + const auto Nu = atpair_unique.second; + const int n_nu = atom_mu_l[Nu]; + Wc_tau_R[Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); + iR_atpair_all.push_back({R, atpair_unique}); + } + } + + LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", + LIBRPA::envs::myid_global, iR_atpair_all.size(), + omp_get_max_threads()); + +#pragma omp parallel for schedule(dynamic) + for (auto iR_atpair : iR_atpair_all) + { + const auto R = iR_atpair.first; + const auto Mu = iR_atpair.second.first; + const auto Nu = iR_atpair.second.second; + const int n_mu = atom_mu_l[Mu]; + const int n_nu = atom_mu_l[Nu]; + + // thread local temporary matrix + matrix_m> WtR_temp(n_mu, n_nu, major_Wc); + + if (Wc_tau_q.count(Mu) == 0) continue; + if (Wc_tau_q.at(Mu).count(Nu) == 0) continue; + // cout << "freq: " << freq << "\n"; + + const auto &Wc_q_all = Wc_tau_q.at(Mu).at(Nu); + for (auto &Wc_q : Wc_q_all) + { + const auto q = Wc_q.first; + const auto &Wc = 
Wc_q.second; + for (auto q_bz : map_irk_ks[q]) + { + const double ang = -q_bz * (R * latvec) * TWO_PI; + const complex weight = + complex(cos(ang), sin(ang)) / double(n_kpoints); + if (q == q_bz) + WtR_temp += Wc * weight; + else + WtR_temp += conj(Wc) * weight; + } + } + // omp_set_lock(&lock_Wc); + Wc_tau_R[Mu][Nu][R] += WtR_temp; + // omp_unset_lock(&lock_Wc); + } + + LIBRPA::utils::lib_printf_root("Done converting Wc q,t -> R,t\n"); + mpi_comm_global_h.barrier(); + return Wc_tau_R; +} + +void test_libcomm_for_system(const atpair_k_cplx_mat_t &coulmat) +{ + if (mpi_comm_global_h.myid == 0) lib_printf("test_libcomm_for_system Coulumb\n"); + // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + + vector> qpts; + for (const auto &qMuNuchi : irk_weight) qpts.push_back(qMuNuchi.first); + +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + int count_coul = 0; + for (const auto &Mu_Nu : local_atpair) + { + const auto 
Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << + // endl; + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + count_coul += 1; + } + int count_pair = 0; + for (auto &Mu : coul_libri) + { + for (auto &nu_q : Mu.second) + { + count_pair += 1; + } + } + // printf("Finish RPA blacs 2d vq arr\n"); + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + lib_printf( + "Begin comm_map2_first myid: %d q:(%f, %f, %f) count_coul: %d count_pair: %d\n", + mpi_comm_global_h.myid, q.x, q.y, q.z, count_coul, count_pair); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + // printf("End vq comm_map2_first myid: %d TIME_USED: + // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); + // ofs_myid << "IJq_coul" << endl << IJq_coul; + // printf("Finish RPA blacs 2d vq 2d\n"); + double block_begin = omp_get_wtime(); + // for (const auto &IJ: set_IJ_nabf_nabf) + // { + // const auto &I = IJ.first; + // const auto &J = IJ.second; + // // cout << IJq_coul.at(I).at({J, qa}); + // collect_block_from_IJ_storage_syhe( + // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, + // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); + // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", + // // blacs_ctxt_global_h.myid, I, J, + // // coul_block.nr(), coul_block.nc(), + // // str(coul_block).c_str()); + // } + 
collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double block_end = omp_get_wtime(); + // lib_printf("Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f + // pair_size: %d\n",mpi_comm_global_h.myid,arr_end-vq_begin, comm_end-comm_begin, + // block_end-block_begin,set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + } + lib_printf("Success test_libcomm_for_system\n"); +#endif +} From 9ce284227658babf9c8b1aaeb3fd754dc289be58 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:11:27 +0800 Subject: [PATCH 02/18] Add files via upload fix a bug which open 4 mpi will contribute error for rap calculation with scalapack and without scalapack --- src/epsilon.cpp | 131 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 115 insertions(+), 16 deletions(-) diff --git a/src/epsilon.cpp b/src/epsilon.cpp index cec86fd0..9ed7d46b 100644 --- a/src/epsilon.cpp +++ b/src/epsilon.cpp @@ -1,5 +1,5 @@ #include "epsilon.h" - +#define OPEN_TEST_FOR_LU_DECOMPOSITION #include #include @@ -317,15 +317,41 @@ CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou // ofs_myid << "Iset Jset " << s0_s1 << endl; // ofs_myid << "atpair_unordered_local of myid " << blacs_ctxt_global_h.myid << " " << // atpair_unordered_local << endl; - + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before vector qpts processid:%d,chi0.tfg.get_freq_nodes()[0]:%f,chi0.get_chi0_q().size():%d\n", mpi_comm_global_h.myid, + // chi0.tfg.get_freq_nodes()[0], chi0.get_chi0_q().size()); + // printf("chi0.get_chi0_q().empty():%d\n", chi0.get_chi0_q().empty()); + 
printf("processId:%d,chi0.klist.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size()); + // for(const auto &k : chi0.klist) + // { + // printf("processId:%d, k: (%f, %f, %f)\n", mpi_comm_global_h.myid, k.x, k.y, k.z); + // } + #endif vector> qpts; - for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - qpts.push_back(qMuNuchi.first); - + + // for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) + // { + // qpts.push_back(qMuNuchi.first); + // #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // const auto &q = qMuNuchi.first; + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + // #endif + // } + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } complex tot_RPA_energy(0.0, 0.0); map, complex> cRPA_q; if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif #ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) { coul_block.zero_out(); @@ -345,9 +371,15 @@ CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou const auto Nu = Mu_Nu.second; // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << // endl; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || coulmat.at(Mu).at(Nu).count(q) == 0) continue; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after if coulmat.count:%d\n", mpi_comm_global_h.myid); + #endif const auto &Vq = coulmat.at(Mu).at(Nu).at(q); const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); @@ -420,8 +452,19 @@ 
CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou std::map>, Tensor>>> chi0_libri; - const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + // printf("processId:%d,chi0.get_chi0_q().empty():%d\n", mpi_comm_global_h.myid, chi0.get_chi0_q().empty()); + #endif + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + // const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); + #endif chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) for (const auto &M_Nchi : chi0_wq) { const auto &M = M_Nchi.first; @@ -444,7 +487,15 @@ CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &cou lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", chi0_wq.size(), freq, q.x, q.y, q.z); } - chi0.free_chi0_q(freq, q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); + #endif + LIBRPA::utils::release_free_mem(); // if(mpi_comm_global_h.is_root()) @@ -605,6 +656,10 @@ complex compute_pi_det_blacs_2d(matrix_m> &loc_piT, // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); // print_complex_real_matrix("first_loc_piT_mat",loc_piT); // } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + printf("success before pzgetrf_ processid:%d,range_all: %d, loc_piT.nr(): %d, loc_piT.nc(): %d\n", + mpi_comm_global_h.myid, range_all, loc_piT.nr(), loc_piT.nc()); + #endif double det_begin = omp_get_wtime(); // 
ScalapackConnector::transpose_desc(DESCPI_T, arrdesc_pi.desc); pzgetrf_(&range_all, &range_all, loc_piT.ptr(), &one, &one, arrdesc_pi.desc, ipiv, &info); @@ -1001,11 +1056,19 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & part_range.resize(atom_mu.size()); part_range[0] = 0; int count_range = 0; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif for (int I = 0; I != atom_mu.size() - 1; I++) { count_range += atom_mu[I]; part_range[I + 1] = count_range; } + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after part_range processid:%d, atom_mu.size(): %zu\n", + // mpi_comm_global_h.myid, atom_mu.size()); + #endif // cout << "part_range:" << endl; // for (int I = 0; I != atom_mu.size(); I++) @@ -1016,20 +1079,29 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & // pi_freq_q contains all atoms map, ComplexMatrix>> pi_freq_q; - - for (const auto &freq_q_MuNupi : pi_freq_q_Mu_Nu) + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("| process %d, qpts.size(): %zu,freq.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size(),chi0.tfg.get_freq_nodes().size()); + #endif + for(const auto &freq : chi0.tfg.get_freq_nodes()) { - const auto freq = freq_q_MuNupi.first; - - for (const auto &q_MuNupi : freq_q_MuNupi.second) - { - const auto q = q_MuNupi.first; - const auto MuNupi = q_MuNupi.second; + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; + if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi 
= freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; pi_freq_q[freq][q].create(range_all, range_all); ComplexMatrix pi_munu_tmp(range_all, range_all); pi_munu_tmp.zero_out(); - + if(!chi0.get_chi0_q().empty()) for (const auto &Mu_Nupi : MuNupi) { const auto Mu = Mu_Nupi.first; @@ -1068,6 +1140,9 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & { complex tot_RPA_energy(0.0, 0.0); map, complex> cRPA_q; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + int num_iteration = 0; + #endif for (const auto &freq_qpi : pi_freq_q) { const auto freq = freq_qpi.first; @@ -1081,6 +1156,21 @@ CorrEnergy compute_RPA_correlation(const Chi0 &chi0, const atpair_k_cplx_mat_t & ComplexMatrix identity_minus_pi(range_all, range_all); identity.set_as_identity_matrix(); identity_minus_pi = identity - pi_freq_q[freq][q]; + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // if(num_iteration==0) + // if(mpi_comm_global_h.myid == 1) + // { + // complex* test_c= identity_minus_pi.c; + // for(int i=0;i det_for_rpa(1.0, 0.0); int info_LU = 0; int *ipiv = new int[range_all]; @@ -1311,12 +1401,21 @@ map, atom_mapping::pair_t_old>> // std::stringstream ss; // ss<<"out_pi_rank_"< ik_vec = k_pair.first; for (int I = 0; I != natom; I++) { + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif atom_mapping::pair_t_old Vq_row = gather_vq_row_q(I, coulmat, ik_vec); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success after gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); + #endif for (auto &freq_p : chi0.get_chi0_q()) { const double freq = freq_p.first; From fb9774969a71a4b0425a62e756a503c2949f310e Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:17:39 +0800 Subject: [PATCH 03/18] Delete epsilon.cpp --- epsilon.cpp | 2970 --------------------------------------------------- 1 file changed, 2970 deletions(-) diff 
--git a/epsilon.cpp b/epsilon.cpp index 9ed7d46b..8b137891 100644 --- a/epsilon.cpp +++ b/epsilon.cpp @@ -1,2971 +1 @@ -#include "epsilon.h" -#define OPEN_TEST_FOR_LU_DECOMPOSITION -#include -#include -#include -#include -#include -#include -#include - -#include "atoms.h" -#include "constants.h" -#include "envs_blacs.h" -#include "envs_io.h" -#include "envs_mpi.h" -#include "lapack_connector.h" -#include "libri_utils.h" -#include "matrix_m_parallel_utils.h" -#include "parallel_mpi.h" -#include "params.h" -#include "pbc.h" -#include "profiler.h" -#include "scalapack_connector.h" -#include "stl_io_helper.h" -#include "utils_blacs.h" -#include "utils_io.h" -#include "utils_mem.h" -#include "utils_mpi_io.h" - -#ifdef LIBRPA_USE_LIBRI -#include -#include -using RI::Tensor; -using RI::Communicate_Tensors_Map_Judge::comm_map2_first; -#endif - -using LIBRPA::Array_Desc; -using LIBRPA::envs::blacs_ctxt_global_h; -using LIBRPA::envs::mpi_comm_global_h; -using LIBRPA::envs::ofs_myid; -using LIBRPA::utils::lib_printf; - -CorrEnergy compute_RPA_correlation_blacs_2d_gamma_only(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) -{ - CorrEnergy corr; - if (mpi_comm_global_h.myid == 0) - lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D gamma_only\n"); - // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); - const auto &mf = chi0.mf; - const double CONE = 1.0; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - // std::cout << "n_abf " << n_abf << std::endl; - // std::cout << "n_atoms " << LIBRPA::atomic_basis_abf.n_atoms << std::endl; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - // std::cout << "part_range " << part_range[0] << " " << part_range[1] << std::endl; - auto nbs_ = LIBRPA::atomic_basis_abf.get_atom_nbs(); - // std::cout << "nbs_ " << nbs_[0] << " " << nbs_[1] << std::endl; - - mpi_comm_global_h.barrier(); - - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // use a square blocksize instead max block, otherwise 
heev and inversion will complain about - // illegal parameter - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - auto chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); - auto coul_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); - auto coul_chi0_block = init_local_mat(desc_nabf_nabf, MAJOR::COL); - - vector> qpts; - for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - qpts.push_back(qMuNuchi.first); - - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); -#ifdef LIBRPA_USE_LIBRI - for (const auto &q : qpts) - { - coul_block.zero_out(); - - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - std::array qa = {q.x, q.y, q.z}; - // collect the block elements of coulomb matrices - { - double vq_begin = omp_get_wtime(); - // LibRI tensor for communication, release once done - std::map>, Tensor>> - coul_libri; - - for (const auto &Mu_Nu : local_atpair) - { - const auto Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << - // endl; - if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || - coulmat.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - matrix tmp_vq_real = (*Vq).real(); - std::valarray Vq_va(tmp_vq_real.c, Vq->size); - auto pvq = std::make_shared>(); - *pvq = Vq_va; - coul_libri[Mu][{Nu, std::array{0, 0, 0}}] = - Tensor({n_mu, n_nu}, pvq); - coulmat.at(Mu).at(Nu).at(q).reset(); - } - - LIBRPA::utils::release_free_mem(); - - // printf("Finish RPA blacs 2d 
vq arr\n"); - double arr_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - double comm_begin = omp_get_wtime(); - // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); - const auto IJq_coul = - comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); - double comm_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - - double block_begin = omp_get_wtime(); - - collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - - double block_end = omp_get_wtime(); - lib_printf( - "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", - mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, - block_end - block_begin, set_IJ_nabf_nabf.size()); - mpi_comm_global_h.barrier(); - double vq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", - vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, - vq_end - block_begin); - } - - double chi_arr_time = 0.0; - double chi_comm_time = 0.0; - double chi_2d_time = 0.0; - for (const auto &freq : chi0.tfg.get_freq_nodes()) - { - const auto ifreq = chi0.tfg.get_freq_index(freq); - const double freq_weight = chi0.tfg.find_freq_weight(freq); - double pi_freq_begin = omp_get_wtime(); - chi0_block.zero_out(); - { - double chi_begin_arr = omp_get_wtime(); - std::map>, Tensor>> - chi0_libri; - const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); - - for (const auto &M_Nchi : chi0_wq) - { - const auto &M = M_Nchi.first; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_chi : M_Nchi.second) - { - const auto &N = N_chi.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &chi = N_chi.second.real(); - std::valarray chi_va(chi.c, chi.size); - auto pchi = std::make_shared>(); - *pchi = chi_va; - chi0_libri[M][{N, std::array{0, 0, 0}}] = - 
Tensor({n_mu, n_nu}, pchi); - } - } - - // if(mpi_comm_global_h.is_root()) - // { - // lib_printf("Begin to clean chi0 !!! \n"); - // system("free -m"); - // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); - // } - - chi0.free_chi0_q(freq, q); - - LIBRPA::utils::release_free_mem(); - - // if(mpi_comm_global_h.is_root()) - // { - // lib_printf("After clean chi0 !!! \n"); - // system("free -m"); - // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); - // } - - mpi_comm_global_h.barrier(); - double chi_end_arr = omp_get_wtime(); - // ofs_myid << "chi0_libri" << endl << chi0_libri; - - const auto IJq_chi0 = - comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); - // ofs_myid << "IJq_chi0" << endl << IJq_chi0; - double chi_end_comm = omp_get_wtime(); - - collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, - LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, - MAJOR::ROW); - // printf("End collect block myid: %d ifreq: %d TIME_USED: - // %f\n",mpi_comm_global_h.myid,ifreq,chi_end_comm-chi_end_arr); - mpi_comm_global_h.barrier(); - double chi_end_2d = omp_get_wtime(); - - chi_arr_time = (chi_end_arr - chi_begin_arr); - chi_comm_time = (chi_end_comm - chi_end_arr); - chi_2d_time = (chi_end_2d - chi_end_comm); - } - - double pi_begin = omp_get_wtime(); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, - desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc); - // char fnp[100]; - // sprintf(fnp, "pi_ifreq_%d_iq_%d.mtx", ifreq, iq); - double pi_end = omp_get_wtime(); - // printf("End pgemm myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); - double trace_pi = 0.0; - double trace_pi_loc = 0.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); - } - - coul_chi0_block 
*= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; - } - - int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; - int info; - // printf("begin det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); - double ln_det = - compute_pi_det_blacs_2d_gamma_only(coul_chi0_block, desc_nabf_nabf, ipiv, info); - // printf("End det myid: %d ifreq: %d \n",mpi_comm_global_h.myid,ifreq); - double det_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); - double pi_freq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - { - lib_printf( - "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " - "%f, CHI_2d: %f, Pi: %f, Det: %f\n", - freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, - chi_2d_time, pi_end - pi_begin, det_end - pi_end); - complex rpa_for_omega_q = complex(trace_pi + ln_det); - /*std::cout << "q: " << iq << ", freq: " << ifreq << ", ln_det:" << ln_det - << ", trace_pi: " << trace_pi << ", rpa_for_omega_q" << rpa_for_omega_q - << ", contribution: " - << rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI << std::endl;*/ - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check - // std::cout << "rpa_for_omega_q: " << rpa_for_omega_q - // << ", freq_weight: " << freq_weight << ", irk_weight[q]:" << - // irk_weight[q] - // << ", cRPA_q[q]: " << cRPA_q[q] << std::endl; - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - } -#else - throw std::logic_error("need compilation with LibRI"); -#endif - if (mpi_comm_global_h.myid == 0) - { - for (auto &q_crpa : cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - } - } - mpi_comm_global_h.barrier(); - corr.value = tot_RPA_energy; - - corr.etype = CorrEnergy::type::RPA; - return corr; -} - -CorrEnergy compute_RPA_correlation_blacs_2d(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) -{ - lib_printf("Begin to compute_RPA_correlation_blacs_2d myid: %d\n", mpi_comm_global_h.myid); - system("free -m"); - CorrEnergy corr; - if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK 2D\n"); - // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); - const auto &mf = chi0.mf; - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - mpi_comm_global_h.barrier(); - - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // use a square blocksize instead max block, otherwise heev and inversion will complain about - // illegal parameter - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - // ofs_myid << "Iset Jset " << s0_s1 << endl; - // ofs_myid << "atpair_unordered_local of myid " << blacs_ctxt_global_h.myid << " " << - // 
atpair_unordered_local << endl; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before vector qpts processid:%d,chi0.tfg.get_freq_nodes()[0]:%f,chi0.get_chi0_q().size():%d\n", mpi_comm_global_h.myid, - // chi0.tfg.get_freq_nodes()[0], chi0.get_chi0_q().size()); - // printf("chi0.get_chi0_q().empty():%d\n", chi0.get_chi0_q().empty()); - printf("processId:%d,chi0.klist.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size()); - // for(const auto &k : chi0.klist) - // { - // printf("processId:%d, k: (%f, %f, %f)\n", mpi_comm_global_h.myid, k.x, k.y, k.z); - // } - #endif - vector> qpts; - - // for (const auto &qMuNuchi : chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - // { - // qpts.push_back(qMuNuchi.first); - // #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // const auto &q = qMuNuchi.first; - // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); - // #endif - // } - for(const auto &q : chi0.klist) - { - qpts.push_back(q); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); - #endif - } - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); - #endif -#ifdef LIBRPA_USE_LIBRI - - for (const auto &q : qpts) - { - coul_block.zero_out(); - - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - std::array qa = {q.x, q.y, q.z}; - // collect the block elements of coulomb matrices - { - double vq_begin = omp_get_wtime(); - // LibRI tensor for communication, release once done - std::map>, Tensor>>> - coul_libri; - coul_libri.clear(); - for (const auto &Mu_Nu : local_atpair) - { - const auto Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << - // endl; - 
#ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before if coulmat.count:%d\n", mpi_comm_global_h.myid); - #endif - if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || - coulmat.at(Mu).at(Nu).count(q) == 0) - continue; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after if coulmat.count:%d\n", mpi_comm_global_h.myid); - #endif - const auto &Vq = coulmat.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); - } - // printf("Finish RPA blacs 2d vq arr\n"); - double arr_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - double comm_begin = omp_get_wtime(); - // printf("Begin comm_map2_first myid: %d\n",mpi_comm_global_h.myid); - const auto IJq_coul = - comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); - double comm_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - // printf("End vq comm_map2_first myid: %d TIME_USED: - // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); - // ofs_myid << "IJq_coul" << endl << IJq_coul; - // printf("Finish RPA blacs 2d vq 2d\n"); - double block_begin = omp_get_wtime(); - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // // cout << IJq_coul.at(I).at({J, qa}); - // collect_block_from_IJ_storage_syhe( - // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", - // // blacs_ctxt_global_h.myid, I, J, - // // coul_block.nr(), coul_block.nc(), - // // str(coul_block).c_str()); - // } - collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - double 
block_end = omp_get_wtime(); - lib_printf( - "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", - mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, - block_end - block_begin, set_IJ_nabf_nabf.size()); - mpi_comm_global_h.barrier(); - double vq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", - vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, - vq_end - block_begin); - } - - // if(mpi_comm_global_h.is_root()) - // printf("Finish RPA blacs 2d vq comm\n"); - // char fn[100]; - // sprintf(fn, "coul_iq_%d.mtx", iq); - // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); - // ofs_myid << str(coul_block); - // lib_printf("coul_block\n%s", str(coul_block).c_str()); - double chi_arr_time = 0.0; - double chi_comm_time = 0.0; - double chi_2d_time = 0.0; - for (const auto &freq : chi0.tfg.get_freq_nodes()) - { - const auto ifreq = chi0.tfg.get_freq_index(freq); - const double freq_weight = chi0.tfg.find_freq_weight(freq); - double pi_freq_begin = omp_get_wtime(); - chi0_block.zero_out(); - { - double chi_begin_arr = omp_get_wtime(); - std::map>, Tensor>>> - chi0_libri; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); - // printf("processId:%d,chi0.get_chi0_q().empty():%d\n", mpi_comm_global_h.myid, chi0.get_chi0_q().empty()); - #endif - atom_mapping::pair_t_old chi0_wq; - if(!chi0.get_chi0_q().empty()) - chi0_wq = chi0.get_chi0_q().at(freq).at(q); - // const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after chi0.get_chi0_q().at(freq).at(q) processId:%d\n", mpi_comm_global_h.myid); - #endif - chi0_libri.clear(); - if(!chi0.get_chi0_q().empty()) - for (const auto &M_Nchi : chi0_wq) - { - const auto &M = M_Nchi.first; - const auto n_mu = 
LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_chi : M_Nchi.second) - { - const auto &N = N_chi.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &chi = N_chi.second; - std::valarray> chi_va(chi.c, chi.size); - auto pchi = std::make_shared>>(); - *pchi = chi_va; - chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); - } - } - if (mpi_comm_global_h.is_root()) - { - lib_printf("Begin to clean chi0 !!! \n"); - LIBRPA::utils::display_free_mem(); - lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", - chi0_wq.size(), freq, q.x, q.y, q.z); - } - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); - #endif - if(!chi0.get_chi0_q().empty()) - chi0.free_chi0_q(freq, q); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after chi0.free_chi0_q(freq, q) processId:%d\n", mpi_comm_global_h.myid); - #endif - - - LIBRPA::utils::release_free_mem(); - // if(mpi_comm_global_h.is_root()) - // { - // lib_printf("After clean chi0 !!! 
\n"); - // system("free -m"); - // lib_printf("chi0_freq_q size: %d\n",chi0_wq.size()); - // } - mpi_comm_global_h.barrier(); - double chi_end_arr = omp_get_wtime(); - // ofs_myid << "chi0_libri" << endl << chi0_libri; - - const auto IJq_chi0 = - comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); - // ofs_myid << "IJq_chi0" << endl << IJq_chi0; - double chi_end_comm = omp_get_wtime(); - collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, - LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, - MAJOR::ROW); - mpi_comm_global_h.barrier(); - double chi_end_2d = omp_get_wtime(); - - chi_arr_time = (chi_end_arr - chi_begin_arr); - chi_comm_time = (chi_end_comm - chi_end_arr); - chi_2d_time = (chi_end_2d - chi_end_comm); - // char fnc[100]; - // sprintf(fnc, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); - // if( ifreq== 0) - // print_matrix_mm_file_parallel(fnc, chi0_block, desc_nabf_nabf); - } - - double pi_begin = omp_get_wtime(); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, 1, - desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc, 0.0, coul_chi0_block.ptr(), 1, 1, - desc_nabf_nabf.desc); - // char fnp[100]; - // sprintf(fnp, "pi_ifreq_%d_iq_%d.mtx", ifreq, iq); - double pi_end = omp_get_wtime(); - - complex trace_pi(0.0, 0.0); - complex trace_pi_loc(0.0, 0.0); - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) trace_pi_loc += coul_chi0_block(ilo, jlo); - } - - coul_chi0_block *= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf.indx_g2l_r(i); - const int jlo = desc_nabf_nabf.indx_g2l_c(i); - if (ilo >= 0 && jlo >= 0) coul_chi0_block(ilo, jlo) += CONE; - // std::cout << "1-Pi: " << ilo << "," << jlo << "," << coul_chi0_block(ilo, jlo) - //<< std::endl; - } - // if( ifreq== 0 && mpi_comm_global_h.is_root() ) - // print_whole_matrix("pi-2D-loc", 
coul_chi0_block); - - int *ipiv = new int[desc_nabf_nabf.m_loc() * 10]; - int info; - complex ln_det = - compute_pi_det_blacs_2d(coul_chi0_block, desc_nabf_nabf, ipiv, info); - double det_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, - mpi_comm_global_h.comm); - double pi_freq_end = omp_get_wtime(); - // double task_end = omp_get_wtime(); - // if(mpi_comm_global_h.is_root()) - // lib_printf("| After det for freq: %f, q: ( %f, %f, %f) TIME_LOCMAT: %f - // TIME_DET: %f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n",ifreq, - // q.x,q.y,q.z,task_mid-task_begin,task_end-task_mid,pi_time,loc_tran_time); - // para_mpi.mpi_barrier(); - - if (mpi_comm_global_h.myid == 0) - { - lib_printf( - "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " - "%f, CHI_2d: %f, Pi: %f, Det: %f\n", - freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, - chi_2d_time, pi_end - pi_begin, det_end - pi_end); - complex rpa_for_omega_q = trace_pi + ln_det; - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - } -#else - throw std::logic_error("need compilation with LibRI"); -#endif - if (mpi_comm_global_h.myid == 0) - { - for (auto &q_crpa : cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - // cout << q_crpa.first << q_crpa.second << endl; - } - // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) - // < &loc_piT, const Array_Desc &arrdesc_pi, - int *ipiv, int &info) -{ - int one = 1; - int range_all = N_all_mu; - int DESCPI_T[9]; - - double det_begin = omp_get_wtime(); - - ScalapackConnector::pgetrf_f(range_all, range_all, loc_piT.ptr(), one, one, arrdesc_pi.desc, - ipiv, info); - double trf_end = omp_get_wtime(); - - double ln_det_loc = 0.0; - double ln_det_all = 0.0; - - for (int ig = 0; ig != range_all; ig++) - { - int locr = arrdesc_pi.indx_g2l_r(ig); - int locc = arrdesc_pi.indx_g2l_c(ig); - if (locr >= 0 && locc >= 0) - { - double tmp_ln_det; - if (loc_piT(locr, locc) > 0) - { - tmp_ln_det = std::log(loc_piT(locr, locc)); - } - else - { - tmp_ln_det = std::log(-loc_piT(locr, locc)); - } - ln_det_loc += tmp_ln_det; - } - } - double ln_end = omp_get_wtime(); - - MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE, MPI_SUM, mpi_comm_global_h.comm); - double det_end = omp_get_wtime(); - return ln_det_all; -} -complex compute_pi_det_blacs_2d(matrix_m> &loc_piT, - const Array_Desc &arrdesc_pi, int *ipiv, int &info) -{ - int one = 1; - int range_all = N_all_mu; - int DESCPI_T[9]; - // if(out_pi) - // { - // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); - // print_complex_real_matrix("first_loc_piT_mat",loc_piT); - // } - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - printf("success before pzgetrf_ processid:%d,range_all: %d, loc_piT.nr(): %d, loc_piT.nc(): %d\n", - mpi_comm_global_h.myid, range_all, loc_piT.nr(), loc_piT.nc()); - #endif - double det_begin = omp_get_wtime(); - // ScalapackConnector::transpose_desc(DESCPI_T, 
arrdesc_pi.desc); - pzgetrf_(&range_all, &range_all, loc_piT.ptr(), &one, &one, arrdesc_pi.desc, ipiv, &info); - double trf_end = omp_get_wtime(); - // ScalapackConnector::pgetrf_f(range_all,range_all,loc_piT.c,one,one,DESCPI_T,ipiv, info); - // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); - // printf("desc myid: %d, m n: %d,%d, mb nb: %d, %d, loc_m_n: %d, %d, myp: %d,%d, npr,npc: - // %d, %d\n",mpi_comm_global_h.myid, arrdesc_pi.m(),arrdesc_pi.n(), - // arrdesc_pi.mb(),arrdesc_pi.nb(), - // arrdesc_pi.m_loc(),arrdesc_pi.n_loc(),arrdesc_pi.myprow(),arrdesc_pi.mypcol(),arrdesc_pi.nprows(),arrdesc_pi.npcols()); - complex ln_det_loc(0.0, 0.0); - complex ln_det_all(0.0, 0.0); - // complex det_loc(1.0,0.0); - // complex det_glo(0.0,0.0); - // vector> det_dig; - // vector> ln_det_dig; - // vector> det_dig_r; - // vector> det_dig_c; - // printf(" myid: %d ig=25, locr,locc: %d, - // %d)\n",mpi_comm_global_h.myid,arrdesc_pi.indx_g2l_r(25),arrdesc_pi.indx_g2l_c(25)); - for (int ig = 0; ig != range_all; ig++) - { - // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); - // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); - int locr = arrdesc_pi.indx_g2l_r(ig); - int locc = arrdesc_pi.indx_g2l_c(ig); - if (locr >= 0 && locc >= 0) - { - // if(ipiv[locr]!=(ig+1)) - // det_loc=-1*det_loc * loc_piT(locc,locr); - // else - // det_loc=det_loc * loc_piT(locc,locr); - // det_dig.push_back(loc_piT(locr,locc)); - // det_dig_r.push_back(locr); - // det_dig_c.push_back(locc); - complex tmp_ln_det; - if (loc_piT(locr, locc).real() > 0) - { - tmp_ln_det = std::log(loc_piT(locr, locc)); - // ln_det_dig.push_back(tmp_ln_det); - } - else - { - tmp_ln_det = std::log(-loc_piT(locr, locc)); - // ln_det_dig.push_back(tmp_ln_det); - } - ln_det_loc += tmp_ln_det; - } - } - double ln_end = omp_get_wtime(); - // ComplexMatrix det_mm(loc_piT.nr(),loc_piT.nc()); - // for(int i=0;i!=loc_piT.nr();i++) - // for(int j=0;j!=loc_piT.nc();j++) - // 
det_mm(i,j)=loc_piT(i,j); - // // sort(det_dig.rbegin(),det_dig.rend()); - // ComplexMatrix det_dig_mm(det_dig.size(),4); - // for(int i=0;i!=det_dig.size();i++) - // { - // det_dig_mm(i,0) =det_dig_r[i]; - // det_dig_mm(i,1) =det_dig_c[i]; - // det_dig_mm(i,2)=det_dig[i]; - // det_dig_mm(i,3)=ln_det_dig[i]; - // } - // char fn[100]; - // sprintf(fn, "det_dig_myid_%d.mtx", mpi_comm_global_h.myid); - // print_complex_matrix_file("det_dig_loc", det_dig_mm, fn, false); - - // sprintf(fn, "det_mat_myid_%d.mtx", mpi_comm_global_h.myid); - // print_complex_matrix_file("det_mat_loc", det_mm, fn, false); - - MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); - double det_end = omp_get_wtime(); - // if(mpi_comm_global_h.myid == 0) - // lib_printf(" | Det time trf: %f ln: %f allreduce: - // %f\n",trf_end-det_begin,ln_end-trf_end, det_end-ln_end); - // MPI_Allreduce(&det_loc,&det_glo,1,MPI_DOUBLE_COMPLEX,MPI_PROD,mpi_comm_global_h.comm); - // ln_det_all=std::log(det_glo); - return ln_det_all; -} - -complex compute_pi_det_blacs(ComplexMatrix &loc_piT, const Array_Desc &arrdesc_pi, - int *ipiv, int &info) -{ - // int range_all = atom_mu_part_range[natom-1]+atom_mu[natom-1]; - // int desc_pi[9]; - // int loc_row, loc_col, info; - // int row_nblk=1; - // int col_nblk=1; - int one = 1; - int range_all = N_all_mu; - // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,range_all,range_all,row_nblk,col_nblk); - // int *ipiv = new int [loc_row*10]; - // ComplexMatrix loc_piT(loc_col,loc_row); - - // for(int i=0;i!=loc_row;i++) - // { - // int global_row = para_mpi.globalIndex(i,row_nblk,para_mpi.nprow,para_mpi.myprow); - // int mu; - // int I=atom_mu_glo2loc(global_row,mu); - // for(int j=0;j!=loc_col;j++) - // { - // int global_col = para_mpi.globalIndex(j,col_nblk,para_mpi.npcol,para_mpi.mypcol); - // int nu; - // int J=atom_mu_glo2loc(global_col,nu); - - // if( global_col == global_row) - // { - // loc_piT(j,i)=complex(1.0,0.0) - 
pi_freq_q.at(I).at(J)(mu,nu); - // } - // else - // { - // loc_piT(j,i)=-1* pi_freq_q.at(I).at(J)(mu,nu); - // } - - // } - // } - int DESCPI_T[9]; - // if(out_pi) - // { - // print_complex_real_matrix("first_pi",pi_freq_q.at(0).at(0)); - // print_complex_real_matrix("first_loc_piT_mat",loc_piT); - // } - - ScalapackConnector::transpose_desc(DESCPI_T, arrdesc_pi.desc); - - // para_mpi.mpi_barrier(); - // printf(" before LU Myid: %d Available DOS memory = %ld - // bytes\n",mpi_comm_global_h.myid, memavail()); printf(" before LU myid: %d range_all: %d, - // loc_mat.size: %d\n",mpi_comm_global_h.myid,range_all,loc_piT.size); - pzgetrf_(&range_all, &range_all, loc_piT.c, &one, &one, DESCPI_T, ipiv, &info); - // printf(" after LU myid: %d\n",mpi_comm_global_h.myid); - complex ln_det_loc(0.0, 0.0); - complex ln_det_all(0.0, 0.0); - for (int ig = 0; ig != range_all; ig++) - { - // int locr=para_mpi.localIndex(ig,row_nblk,para_mpi.nprow,para_mpi.myprow); - // int locc=para_mpi.localIndex(ig,col_nblk,para_mpi.npcol,para_mpi.mypcol); - int locr = arrdesc_pi.indx_g2l_r(ig); - int locc = arrdesc_pi.indx_g2l_c(ig); - if (locr >= 0 && locc >= 0) - { - // if(ipiv[locr]!=(ig+1)) - // det_loc=-1*det_loc * loc_piT(locc,locr); - // else - // det_loc=det_loc * loc_piT(locc,locr); - if (loc_piT(locc, locr).real() > 0) - ln_det_loc += std::log(loc_piT(locc, locr)); - else - ln_det_loc += std::log(-loc_piT(locc, locr)); - } - } - MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); - return ln_det_all; -} - -CorrEnergy compute_RPA_correlation_blacs(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) -{ - CorrEnergy corr; - if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK row\n"); - - const auto &mf = chi0.mf; - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - mpi_comm_global_h.barrier(); - - 
LIBRPA::Array_Desc arrdesc_pi(blacs_ctxt_global_h); - arrdesc_pi.init_square_blk(n_abf, n_abf, 0, 0); - int loc_row = arrdesc_pi.m_loc(), loc_col = arrdesc_pi.n_loc(), info; - - // para_mpi.set_blacs_mat(desc_pi,loc_row,loc_col,N_all_mu,N_all_mu,row_nblk,col_nblk); - int *ipiv = new int[loc_row * 10]; - // double vq_begin_m2t= omp_get_wtime(); - // std::map>, Tensor>>> - // vq_libri; for(auto &Ip:Vq) - // { - // auto I=Ip.first; - // const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(I); - // for(auto &Jp:Ip.second) - // { - // auto J=Jp.first; - // const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(J); - // for(auto &qp:Jp.second) - // { - // auto q=qp.first; - // std::array qa = {q.x, q.y, q.z}; - // const auto &vq_ptr=qp.second; - // std::valarray> Vq_va(vq_ptr->c, vq_ptr->size); - // auto pvq = std::make_shared>>(); - // *pvq = Vq_va; - // vq_libri[I][{J, qa}] = Tensor>({n_mu, n_nu}, pvq); - // if(I!=J) - // { - // auto vqT=transpose(*vq_ptr, 1); - // std::valarray> VqT_va(vqT.c, vqT.size); - // auto pvqT = std::make_shared>>(); - // *pvqT = VqT_va; - // vq_libri[J][{I, qa}] = Tensor>({n_nu, n_mu}, pvqT); - // } - // } - // } - // } - // double vq_end_m2t = omp_get_wtime(); - // set loc_atp_IJ; - // for(auto &atp:local_atpair) - // { - // loc_atp_IJ.insert(atp.first); - // loc_atp_IJ.insert(atp.second); - // } - // set all_atom_set; - // for(int I=0;I!=natom;I++) - // all_atom_set.insert(I); - // const auto IJq_coul = Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, - // vq_libri, all_atom_set, loc_atp_IJ); atpair_k_cplx_mat_t Vq_loc; double vq_end_comm = - // omp_get_wtime(); for(auto Ip:IJq_coul) - // { - // auto I=Ip.first; - // auto n_mu=atom_mu[I]; - // for(auto &Jqp:Ip.second) - // { - // auto J=Jqp.first.first; - // auto n_nu=atom_mu[J]; - // auto qa=Jqp.first.second; - // Vector3_Order q{qa[0],qa[1],qa[2]}; - // shared_ptr vq_ptr = make_shared(); - // vq_ptr->create(n_mu, n_nu); - // const auto length=sizeof(complex)* n_mu 
*n_nu; - // memcpy((*vq_ptr).c, Jqp.second.ptr(),length); - // Vq_loc[I][J][q]=vq_ptr; - // //printf("| process %d, I: %d J: %d\n",mpi_comm_global_h.myid, I,J ); - // } - // } - // double vq_end_t2m = omp_get_wtime(); - // mpi_comm_global_h.barrier(); - // if(mpi_comm_global_h.is_root()) - // lib_printf("| Vq_time %f, TIME_m2t: %f TIME_comm: %f TIME_t2m: - // %f\n",vq_end_t2m-vq_begin_m2t,vq_end_m2t-vq_begin_m2t,vq_end_comm-vq_end_m2t,vq_end_t2m-vq_end_comm); - map, ComplexMatrix>> pi_freq_q; - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - for (const auto &freq_q_MuNuchi0 : chi0.get_chi0_q()) - { - const auto freq = freq_q_MuNuchi0.first; - const double freq_weight = chi0.tfg.find_freq_weight(freq); - for (const auto &q_MuNuchi0 : freq_q_MuNuchi0.second) - { - double task_begin = omp_get_wtime(); - const auto q = q_MuNuchi0.first; - auto &MuNuchi0 = q_MuNuchi0.second; - - // ComplexMatrix loc_piT(loc_col,loc_row); - auto loc_piT = init_local_mat>(arrdesc_pi, MAJOR::COL); - complex trace_pi(0.0, 0.0); - double vq_time = 0.0; - double pi_time = 0.0; - double loc_tran_time = 0.0; - for (int Mu = 0; Mu != natom; Mu++) - { - double Mu_begin = omp_get_wtime(); - // lib_printf(" |process %d, Mu: %d\n",mpi_comm_global_h.myid,Mu); - const size_t n_mu = atom_mu[Mu]; - atom_mapping::pair_t_old Vq_row = gather_vq_row_q(Mu, coulmat, q); - double Mu_after_vq = omp_get_wtime(); - // atom_mapping::pair_t_old Vq_row; - // const auto IJq_coul = - // Communicate_Tensors_Map_Judge::comm_map2_first(mpi_comm_global_h.comm, vq_libri, - // {Mu}, loc_atp_atoms); double Mu_vq_comm = omp_get_wtime(); for(auto Ip:IJq_coul) - // { - // auto I=Ip.first; - // auto n_mu=atom_mu[I]; - // for(auto &Jqp:Ip.second) - // { - // auto J=Jqp.first.first; - // auto n_nu=atom_mu[J]; - // auto q=Jqp.first.second; - // Vq_row[I][J].create(n_mu,n_nu); - // const auto length=sizeof(complex)* n_mu *n_nu; - // memcpy(Vq_row[I][J].c, Jqp.second.ptr(),length); - // } - // } - // double 
Mu_after_vq=omp_get_wtime(); - // printf(" |process %d, Mu: %d vq_row.size: - // %d\n",para_mpi.get_myid(),Mu,Vq_row[Mu].size()); ComplexMatrix - // loc_pi_row=compute_Pi_freq_q_row(q,MuNuchi0,Vq_loc,Mu,q); - ComplexMatrix loc_pi_row = compute_Pi_freq_q_row(q, MuNuchi0, Vq_row, Mu); - // printf(" |process %d, compute_pi\n",para_mpi.get_myid()); - ComplexMatrix glo_pi_row(n_mu, N_all_mu); - mpi_comm_global_h.barrier(); - mpi_comm_global_h.allreduce_ComplexMatrix(loc_pi_row, glo_pi_row); - double Mu_after_pi_loc = omp_get_wtime(); - // cout<<" glo_pi_rowT nr,nc: "<(1.0, 0.0) - - glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); - } - else - { - loc_piT(i, j) = - -glo_pi_row(mu_blacs, atom_mu_part_range[J_blacs] + nu_blacs); - } - } - } - double Mu_after_loc_tran = omp_get_wtime(); - vq_time += (Mu_after_vq - Mu_begin); - pi_time += (Mu_after_pi_loc - Mu_after_vq); - loc_tran_time += (Mu_after_loc_tran - Mu_after_pi_loc); - } - // if(freq == chi0.tfg.get_freq_nodes()[0] && mpi_comm_global_h.is_root()) - // print_complex_matrix(" loc_piT",loc_piT); - double task_mid = omp_get_wtime(); - // printf("|process %d, before det\n",mpi_comm_global_h.myid); - complex ln_det = compute_pi_det_blacs_2d(loc_piT, arrdesc_pi, ipiv, info); - double task_end = omp_get_wtime(); - if (mpi_comm_global_h.is_root()) - lib_printf( - "| After det for freq: %f, q: ( %f, %f, %f) TIME_Vq_COMM: %f TIME_DET: " - "%f TIME_CAL_Pi: %f, TIME_TRAN_LOC: %f\n", - freq, q.x, q.y, q.z, vq_time, task_end - task_mid, pi_time, loc_tran_time); - // para_mpi.mpi_barrier(); - if (mpi_comm_global_h.myid == 0) - { - complex rpa_for_omega_q = trace_pi + ln_det; - // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " - // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! 
check - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - } - - if (mpi_comm_global_h.myid == 0) - { - for (auto &q_crpa : cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - // cout << q_crpa.first << q_crpa.second << endl; - } - // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) - // <, atom_mapping::pair_t_old>> - pi_freq_q_Mu_Nu; - if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || - LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) - pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); - else - pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); - lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, - pi_freq_q_Mu_Nu.size()); - // mpi_comm_global_h.barrier(); - - int range_all = N_all_mu; - - vector part_range; - part_range.resize(atom_mu.size()); - part_range[0] = 0; - int count_range = 0; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before part_range processid:%d, atom_mu.size(): %zu\n", - // mpi_comm_global_h.myid, atom_mu.size()); - #endif - for (int I = 0; I != atom_mu.size() - 1; I++) - { - count_range += atom_mu[I]; - part_range[I + 1] = count_range; - } - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after part_range processid:%d, atom_mu.size(): %zu\n", - // mpi_comm_global_h.myid, atom_mu.size()); - #endif - - // cout << "part_range:" << endl; - // for (int I = 0; I != atom_mu.size(); I++) - // { - // cout << part_range[I] << endl; - // } - // cout << "part_range over" << endl; - - // pi_freq_q contains all atoms - map, ComplexMatrix>> pi_freq_q; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("| process %d, qpts.size(): %zu,freq.size():%zu\n", mpi_comm_global_h.myid, chi0.klist.size(),chi0.tfg.get_freq_nodes().size()); - #endif - for(const auto &freq : chi0.tfg.get_freq_nodes()) - { - // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); - map, atom_mapping::pair_t_old> freq_q_MuNupi; 
- if(!chi0.get_chi0_q().empty()) - freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", - // mpi_comm_global_h.myid, freq_q_MuNupi.size()); - #endif - for(const auto &q:chi0.klist){ - atom_mapping::pair_t_old q_MuNupi; - if(!chi0.get_chi0_q().empty()) - q_MuNupi = freq_q_MuNupi.at(q); - const auto MuNupi = q_MuNupi; - pi_freq_q[freq][q].create(range_all, range_all); - - ComplexMatrix pi_munu_tmp(range_all, range_all); - pi_munu_tmp.zero_out(); - if(!chi0.get_chi0_q().empty()) - for (const auto &Mu_Nupi : MuNupi) - { - const auto Mu = Mu_Nupi.first; - const auto Nupi = Mu_Nupi.second; - const size_t n_mu = atom_mu[Mu]; - for (const auto &Nu_pi : Nupi) - { - const auto Nu = Nu_pi.first; - const auto pimat = Nu_pi.second; - const size_t n_nu = atom_mu[Nu]; - - for (size_t mu = 0; mu != n_mu; ++mu) - { - for (size_t nu = 0; nu != n_nu; ++nu) - { - pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); - } - } - } - } - if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || - LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) - { - mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); - } - else - { - pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); - } - } - } - // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, - // pi_freq_q_Mu_Nu.size()); - mpi_comm_global_h.barrier(); - // if (mpi_comm_global_h.myid == 0) - { - complex tot_RPA_energy(0.0, 0.0); - map, complex> cRPA_q; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - int num_iteration = 0; - #endif - for (const auto &freq_qpi : pi_freq_q) - { - const auto freq = freq_qpi.first; - const double freq_weight = chi0.tfg.find_freq_weight(freq); - for (const auto &q_pi : freq_qpi.second) - { - const auto q = q_pi.first; - const auto pimat = q_pi.second; - complex rpa_for_omega_q(0.0, 0.0); - ComplexMatrix 
identity(range_all, range_all); - ComplexMatrix identity_minus_pi(range_all, range_all); - identity.set_as_identity_matrix(); - identity_minus_pi = identity - pi_freq_q[freq][q]; - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // if(num_iteration==0) - // if(mpi_comm_global_h.myid == 1) - // { - // complex* test_c= identity_minus_pi.c; - // for(int i=0;i det_for_rpa(1.0, 0.0); - int info_LU = 0; - int *ipiv = new int[range_all]; - LapackConnector::zgetrf(range_all, range_all, identity_minus_pi, range_all, ipiv, - &info_LU); - for (int ib = 0; ib != range_all; ib++) - { - if (ipiv[ib] != (ib + 1)) - det_for_rpa = -det_for_rpa * identity_minus_pi(ib, ib); - else - det_for_rpa = det_for_rpa * identity_minus_pi(ib, ib); - } - delete[] ipiv; - - complex trace_pi; - complex ln_det; - ln_det = std::log(det_for_rpa); - trace_pi = trace(pi_freq_q.at(freq).at(q)); - // cout << "PI trace vector:" << endl; - // cout << endl; - rpa_for_omega_q = ln_det + trace_pi; - // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " - // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; - cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; - } - } - // lib_printf("Finish EcRPA %4d, size %zu\n", mpi_comm_global_h.myid, - // pi_freq_q_Mu_Nu.size()); - mpi_comm_global_h.barrier(); - map, complex> global_cRPA_q; - for (auto q_weight : irk_weight) - { - MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, - MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); - } - - for (auto &q_crpa : global_cRPA_q) - { - corr.qcontrib[q_crpa.first] = q_crpa.second; - } - complex gather_tot_RPA_energy(0.0, 0.0); - MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, - mpi_comm_global_h.comm); - corr.value = gather_tot_RPA_energy; - } - corr.etype = CorrEnergy::type::RPA; - return corr; -} - -CorrEnergy compute_MP2_correlation(const Chi0 
&chi0, const atpair_k_cplx_mat_t &coulmat) -{ - CorrEnergy corr; - corr.etype = CorrEnergy::type::MP2; - return corr; -} - -map, atom_mapping::pair_t_old>> compute_Pi_q( - const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) -{ - map, atom_mapping::pair_t_old>> pi; - lib_printf("Begin compute_Pi_q , pid: %d\n", mpi_comm_global_h.myid); - for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) - { - const double freq = freq_qJQchi0.first; - for (auto &q_JQchi0 : freq_qJQchi0.second) - { - Vector3_Order q = q_JQchi0.first; - for (auto &JQchi0 : q_JQchi0.second) - { - const size_t J = JQchi0.first; - const size_t J_mu = atom_mu[J]; - for (auto &Qchi0 : JQchi0.second) - { - const size_t Q = Qchi0.first; - const size_t Q_mu = atom_mu[Q]; - // auto &chi0_mat = Qchi0.second; - for (int I = 0; I != natom; I++) - { - // const size_t I = I_p.first; - const size_t I_mu = atom_mu[I]; - pi[freq][q][I][Q].create(I_mu, Q_mu); - if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); - } - } - } - // if(freq==chi0.tfg.get_freq_nodes()[0]) - // for(auto &Ip:pi[freq][q]) - // for(auto &Jp:Ip.second) - // lib_printf(" |process %d, pi atpair: %d, %d - // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); - } - } - - // ofstream fp; - // std::stringstream ss; - // ss<<"out_pi_rank_"< ik_vec = k_pair.first; - auto chi0_freq_k = k_pair.second; - for (auto &J_p : chi0_freq_k) - { - const size_t J = J_p.first; - for (auto &Q_p : J_p.second) - { - const size_t Q = Q_p.first; - auto &chi0_mat = Q_p.second; - for (int I = 0; I != natom; I++) - { - // const size_t I = I_p.first; - // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, - // J, Q); - // cout<<" pi_IQ: "< trace_pi; - // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); - // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << - // ik_vec << " trace_pi: " << trace_pi << endl; - // print_complex_matrix_file(sm.str().c_str(), - // (*Vq.at(I).at(J).at(ik_vec)),fp,false); - // print_complex_matrix_file("chi0:", 
chi0_mat,fp,false); - // print_complex_matrix_file("pi_mat:", - // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); - // } - } - else - { - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", - // mpi_comm_global_h.myid, I, J, Q,2); - // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, 0) << - // endl; - pi.at(freq).at(ik_vec).at(I).at(Q) += - transpose(*Vq.at(J).at(I).at(ik_vec), 1) * chi0_mat; - } - - if (J != Q) - { - ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - if (I <= Q) - { - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d - // \n", mpi_comm_global_h.myid, I, J, Q,3); - // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; - pi.at(freq).at(ik_vec).at(I).at(J) += - (*Vq.at(I).at(Q).at(ik_vec)) * chi0_QJ; - } - else - { - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d - // \n", mpi_comm_global_h.myid, I, J, Q,4); - // << " Vq: " << transpose(*Vq.at(J).at(I).at(ik_vec), 1)(0, - // 0) << endl; - pi.at(freq).at(ik_vec).at(I).at(J) += - transpose(*Vq.at(Q).at(I).at(ik_vec), 1) * chi0_QJ; - } - } - } - } - } - } - } - // fp.close(); - // print_complex_matrix(" - // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); - /* print_complex_matrix(" - * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ - return pi; -} - -map, atom_mapping::pair_t_old>> compute_Pi_q_MPI( - const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat) -{ - map, atom_mapping::pair_t_old>> pi; - lib_printf("Begin compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); - for (auto const &freq_qJQchi0 : chi0.get_chi0_q()) - { - const double freq = freq_qJQchi0.first; - for (auto &q_JQchi0 : freq_qJQchi0.second) - { - Vector3_Order q = q_JQchi0.first; - for (auto &JQchi0 : q_JQchi0.second) - { - const size_t J = JQchi0.first; - const size_t J_mu = atom_mu[J]; 
- for (auto &Qchi0 : JQchi0.second) - { - const size_t Q = Qchi0.first; - const size_t Q_mu = atom_mu[Q]; - // auto &chi0_mat = Qchi0.second; - for (int I = 0; I != natom; I++) - { - // const size_t I = I_p.first; - const size_t I_mu = atom_mu[I]; - pi[freq][q][I][Q].create(I_mu, Q_mu); - if (J != Q) pi[freq][q][I][J].create(I_mu, J_mu); - } - } - } - // if(freq==chi0.tfg.get_freq_nodes()[0]) - // for(auto &Ip:pi[freq][q]) - // for(auto &Jp:Ip.second) - // lib_printf(" |process %d, pi atpair: %d, %d - // \n",mpi_comm_global_h.myid,Ip.first,Jp.first); - } - } - - // ofstream fp; - // std::stringstream ss; - // ss<<"out_pi_rank_"< ik_vec = k_pair.first; - for (int I = 0; I != natom; I++) - { - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success before gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); - #endif - atom_mapping::pair_t_old Vq_row = gather_vq_row_q(I, coulmat, ik_vec); - #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION - // printf("success after gather_vp_row_q irk_weight, pid: %d\n", mpi_comm_global_h.myid); - #endif - for (auto &freq_p : chi0.get_chi0_q()) - { - const double freq = freq_p.first; - const auto chi0_freq = freq_p.second; - - auto chi0_freq_k = freq_p.second.at(ik_vec); - - for (auto &J_p : chi0_freq_k) - { - const size_t J = J_p.first; - for (auto &Q_p : J_p.second) - { - const size_t Q = Q_p.first; - auto &chi0_mat = Q_p.second; - - // const size_t I = I_p.first; - // printf("cal_pi pid: %d , IJQ: %d %d %d\n", mpi_comm_global_h.myid, I, - // J, Q); - // cout<<" pi_IQ: "< trace_pi; - // trace_pi = trace(pi.at(freq).at(ik_vec).at(I).at(Q)); - // sm << " IJQ: " << I << " " << J << " " << Q << " ik_vec: " << ik_vec - // << " trace_pi: " << trace_pi << endl; - // print_complex_matrix_file(sm.str().c_str(), - // Vq_row.at(I).at(J),fp,false); print_complex_matrix_file("chi0:", - // chi0_mat,fp,false); print_complex_matrix_file("pi_mat:", - // pi.at(freq).at(ik_vec).at(I).at(Q),fp,false); - // } - - if (J != Q) - { - ComplexMatrix 
chi0_QJ = transpose(chi0_mat, 1); - // if (freq == chi0.tfg.get_freq_nodes()[0]) - // lib_printf("cal_pi pid: %d , IJQ: %d %d %d type: %d \n", - // mpi_comm_global_h.myid, I, J,Q,3); - // << " Vq: " << (*Vq.at(I).at(Q).at(ik_vec))(0, 0) << endl; - pi.at(freq).at(ik_vec).at(I).at(J) += Vq_row.at(I).at(Q) * chi0_QJ; - } - } - } - } - } - } - // fp.close(); - // print_complex_matrix(" - // first_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(0).at(0)); - /* print_complex_matrix(" - * last_pi_mat:",pi.at(chi0.tfg.get_freq_nodes()[0]).at({0,0,0}).at(natom-1).at(natom-1)); */ - lib_printf("End compute_Pi_q_MPI , pid: %d\n", mpi_comm_global_h.myid); - return pi; -} - -ComplexMatrix compute_Pi_freq_q_row(const Vector3_Order &ik_vec, - const atom_mapping::pair_t_old &chi0_freq_q, - const atom_mapping::pair_t_old &Vq_row, - const int &I) -{ - map pi; - // lib_printf("Begin cal_pi_k , pid: %d\n", para_mpi.get_myid()); - auto I_mu = atom_mu[I]; - for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); - - omp_lock_t pi_lock; - omp_init_lock(&pi_lock); -#pragma omp parallel for schedule(dynamic) - for (int iap = 0; iap != local_atpair.size(); iap++) - { - const size_t J = local_atpair[iap].first; - const size_t Q = local_atpair[iap].second; - auto &chi0_mat = chi0_freq_q.at(J).at(Q); - auto tmp_pi_mat = Vq_row.at(I).at(J) * chi0_mat; - ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - auto tmp_pi_mat2 = Vq_row.at(I).at(Q) * chi0_QJ; - omp_set_lock(&pi_lock); - pi.at(Q) += tmp_pi_mat; - if (J != Q) - { - pi.at(J) += tmp_pi_mat2; - } - omp_unset_lock(&pi_lock); - } - omp_destroy_lock(&pi_lock); - // for (auto &J_p : chi0_freq_q) - // { - // const size_t J = J_p.first; - // for (auto &Q_p : J_p.second) - // { - // const size_t Q = Q_p.first; - // auto &chi0_mat = Q_p.second; - // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; - // if (J != Q) - // { - // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; - // } - // } - 
// } - // Pi_rowT - // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); - // complex *pi_row_ptr=pi_row.c; - // for(auto &Jp:pi) - // { - // auto J=Jp.first; - // auto J_mu=atom_mu[J]; - // const auto length=sizeof(complex)* I_mu *J_mu; - // memcpy(pi_row_ptr, pi.at(J).c,length); - // pi_row_ptr+=I_mu *J_mu; - // } - ComplexMatrix pi_row(atom_mu[I], N_all_mu); - for (int i = 0; i != pi_row.nr; i++) - for (int J = 0; J != natom; J++) - for (int j = 0; j != atom_mu[J]; j++) - pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); - return pi_row; -} - -ComplexMatrix compute_Pi_freq_q_row_ri(const Vector3_Order &ik_vec, - const atom_mapping::pair_t_old &chi0_freq_q, - const atpair_k_cplx_mat_t &Vq_loc, const int &I, - const Vector3_Order &q) -{ - map pi; - // lib_printf("Begin cal_pi_k , pid: %d\n", mpi_comm_global_h.myid); - auto I_mu = atom_mu[I]; - for (int J = 0; J != natom; J++) pi[J].create(I_mu, atom_mu[J]); - - omp_lock_t pi_lock; - omp_init_lock(&pi_lock); -#pragma omp parallel for schedule(dynamic) - for (int iap = 0; iap != local_atpair.size(); iap++) - { - const size_t J = local_atpair[iap].first; - const size_t Q = local_atpair[iap].second; - auto &chi0_mat = chi0_freq_q.at(J).at(Q); - // printf("| IN cal Pi process %d, I: %d J: %d Q: %d\n",mpi_comm_global_h.myid, I,J,Q ); - auto tmp_pi_mat = *Vq_loc.at(I).at(J).at(q) * chi0_mat; - ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - auto tmp_pi_mat2 = *Vq_loc.at(I).at(Q).at(q) * chi0_QJ; - omp_set_lock(&pi_lock); - pi.at(Q) += tmp_pi_mat; - if (J != Q) - { - pi.at(J) += tmp_pi_mat2; - } - omp_unset_lock(&pi_lock); - } - omp_destroy_lock(&pi_lock); - // for (auto &J_p : chi0_freq_q) - // { - // const size_t J = J_p.first; - // for (auto &Q_p : J_p.second) - // { - // const size_t Q = Q_p.first; - // auto &chi0_mat = Q_p.second; - // pi.at(Q) += Vq_row.at(I).at(J) * chi0_mat; - // if (J != Q) - // { - // ComplexMatrix chi0_QJ = transpose(chi0_mat, 1); - // pi.at(J) += Vq_row.at(I).at(Q) * chi0_QJ; - // } - // } - 
// } - // Pi_rowT - // ComplexMatrix pi_row(N_all_mu,atom_mu[I]); - // complex *pi_row_ptr=pi_row.c; - // for(auto &Jp:pi) - // { - // auto J=Jp.first; - // auto J_mu=atom_mu[J]; - // const auto length=sizeof(complex)* I_mu *J_mu; - // memcpy(pi_row_ptr, pi.at(J).c,length); - // pi_row_ptr+=I_mu *J_mu; - // } - ComplexMatrix pi_row(atom_mu[I], N_all_mu); - for (int i = 0; i != pi_row.nr; i++) - for (int J = 0; J != natom; J++) - for (int j = 0; j != atom_mu[J]; j++) - pi_row(i, atom_mu_part_range[J] + j) = pi.at(J)(i, j); - return pi_row; -} - -atom_mapping::pair_t_old gather_vq_row_q(const int &I, - const atpair_k_cplx_mat_t &coulmat, - const Vector3_Order &ik_vec) -{ - auto I_mu = atom_mu[I]; - atom_mapping::pair_t_old Vq_row; - for (int J_tmp = 0; J_tmp != natom; J_tmp++) - { - auto J_mu = atom_mu[J_tmp]; - ComplexMatrix loc_vq(atom_mu[I], atom_mu[J_tmp]); - Vq_row[I][J_tmp].create(atom_mu[I], atom_mu[J_tmp]); - // const auto length=sizeof(complex)* I_mu *J_mu; - // complex *loc_vq_ptr=loc_vq.c; - if (I <= J_tmp) - { - if (Vq.count(I)) - if (Vq.at(I).count(J_tmp)) loc_vq = *Vq.at(I).at(J_tmp).at(ik_vec); - } - else - { - if (Vq.count(J_tmp)) - if (Vq.at(J_tmp).count(I)) loc_vq = transpose(*Vq.at(J_tmp).at(I).at(ik_vec), 1); - } - mpi_comm_global_h.allreduce_ComplexMatrix(loc_vq, Vq_row[I][J_tmp]); - } - return Vq_row; -} - -map, matrix_m>>>::pair_t_old> -compute_Wc_freq_q(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, - atpair_k_cplx_mat_t &coulmat_wc, - const vector> &epsmac_LF_imagfreq) -{ - map, matrix_m>>>::pair_t_old> - Wc_freq_q; - const int range_all = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - if (mpi_comm_global_h.myid == 0) - { - cout << "Calculating Wc using LAPACK" << endl; - } - - mpi_comm_global_h.barrier(); - // use q-points as the outmost loop, so that square root of Coulomb will not be recalculated at - // each frequency point - vector> qpts; - for (const auto &qMuNuchi : 
chi0.get_chi0_q().at(chi0.tfg.get_freq_nodes()[0])) - qpts.push_back(qMuNuchi.first); - - for (const auto &q : qpts) - { - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - char fn[80]; - - ComplexMatrix Vq_all(range_all, range_all); - for (const auto &Mu_NuqVq : coulmat_eps) - { - auto Mu = Mu_NuqVq.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_qVq : Mu_NuqVq.second) - { - auto Nu = Nu_qVq.first; - if (0 == Nu_qVq.second.count(q)) continue; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - { - Vq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = - (*Nu_qVq.second.at(q))(i_mu, i_nu); - Vq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = - conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); - } - } - } - if (Params::debug) - { - sprintf(fn, "Vq_all_q_%d.mtx", iq); - print_complex_matrix_mm(Vq_all, Params::output_dir + "/" + fn, 1e-15); - } - auto sqrtVq_all = power_hemat(Vq_all, 0.5, true, false, Params::sqrt_coulomb_threshold); - // Vq_all is now eigenvectors of the original Coulomb matrix - const auto &Vq_eigen = Vq_all; - if (Params::debug) - { - sprintf(fn, "sqrtVq_all_q_%d.mtx", iq); - print_complex_matrix_mm(sqrtVq_all, Params::output_dir + "/" + fn, 1e-15); - // sprintf(fn, "rotated_sqrtVq_all_q_%d.mtx", iq); - // print_complex_matrix_mm(Vq_all * sqrtVq_all * transpose(Vq_all, true), fn, 1e-15); - // print_complex_matrix_mm(transpose(Vq_all, true) * sqrtVq_all * Vq_all, fn, 1e-15); - sprintf(fn, "Vqeigenvec_q_%d.mtx", iq); - print_complex_matrix_mm(Vq_eigen, Params::output_dir + "/" + fn, 1e-15); - } - - // truncated (cutoff) Coulomb - ComplexMatrix Vqcut_all(range_all, range_all); - for (auto &Mu_NuqVq : coulmat_wc) - { - auto Mu = Mu_NuqVq.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_qVq : Mu_NuqVq.second) - { - auto Nu = Nu_qVq.first; - if (0 == Nu_qVq.second.count(q)) continue; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for 
(int i_nu = 0; i_nu != n_nu; i_nu++) - { - Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = - (*Nu_qVq.second.at(q))(i_mu, i_nu); - Vqcut_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = - conj((*Nu_qVq.second.at(q))(i_mu, i_nu)); - } - } - } - auto sqrtVqcut_all = - power_hemat(Vqcut_all, 0.5, false, true, Params::sqrt_coulomb_threshold); - // sprintf(fn, "sqrtVqcut_all_q_%d.mtx", iq); - // print_complex_matrix_mm(sqrtVqcut_all, fn, 1e-15); - sprintf(fn, "Vqcut_all_filtered_q_%d.mtx", iq); - // print_complex_matrix_mm(Vqcut_all, fn, 1e-15); - // save the filtered truncated Coulomb back to the atom mapping object - // TODO: revise the necessity - for (auto &Mu_NuqVq : coulmat_wc) - { - auto Mu = Mu_NuqVq.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_qVq : Mu_NuqVq.second) - { - auto Nu = Nu_qVq.first; - if (0 == Nu_qVq.second.count(q)) continue; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - (*Nu_qVq.second.at(q))(i_mu, i_nu) = - Vqcut_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu); - } - } - - ComplexMatrix chi0fq_all(range_all, range_all); - for (const auto &freq_qMuNuchi : chi0.get_chi0_q()) - { - auto freq = freq_qMuNuchi.first; - auto ifreq = chi0.tfg.get_freq_index(freq); - auto MuNuchi = freq_qMuNuchi.second.at(q); - for (const auto &Mu_Nuchi : MuNuchi) - { - auto Mu = Mu_Nuchi.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_chi : Mu_Nuchi.second) - { - auto Nu = Nu_chi.first; - auto n_nu = atom_mu[Nu]; - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - { - chi0fq_all(part_range[Mu] + i_mu, part_range[Nu] + i_nu) = - Nu_chi.second(i_mu, i_nu); - chi0fq_all(part_range[Nu] + i_nu, part_range[Mu] + i_mu) = - conj(Nu_chi.second(i_mu, i_nu)); - } - } - } - sprintf(fn, "chi0fq_all_q_%d_freq_%d.mtx", iq, ifreq); - print_complex_matrix_mm(chi0fq_all, Params::output_dir + "/" + fn, 1e-15); - - ComplexMatrix identity(range_all, range_all); - 
identity.set_as_identity_matrix(); - auto eps_fq = sqrtVq_all * chi0fq_all * sqrtVq_all; - eps_fq = transpose(Vq_eigen, true) * eps_fq * Vq_eigen; - if (!epsmac_LF_imagfreq.empty() && is_gamma_point(q)) - { - // rotate to Coulomb-diagonal basis - // lib_printf("Largest off-diagonal = %f\n", eps_fq.get_max_abs_offdiag()); - // print_matrix("rotated eps_fq: ", eps_fq.real()); - // replacing the element corresponding to largest Coulomb eigenvalue with dielectric - // function - lib_printf("%22.12f %22.12f %22.12f %22.12f\n", freq, eps_fq(0, 0).real(), - eps_fq(eps_fq.nr - 1, eps_fq.nc - 1).real(), - epsmac_LF_imagfreq[ifreq].real()); - // eps_fq(eps_fq.nr - 1, eps_fq.nc - 1) = epsmac_LF_imagfreq[ifreq]; - eps_fq(0, 0) = 1.0 - epsmac_LF_imagfreq[ifreq]; - } - if (Params::debug) - { - sprintf(fn, "rotated_vsxvs_q_%d_freq_%d.mtx", iq, ifreq); - print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); - } - // rotate back to ABF - eps_fq = Vq_eigen * eps_fq * transpose(Vq_eigen, true); - eps_fq = identity - eps_fq; - if (Params::debug) - { - sprintf(fn, "eps_q_%d_freq_%d.mtx", iq, ifreq); - print_complex_matrix_mm(eps_fq, Params::output_dir + "/" + fn, 1e-10); - } - - // invert the epsilon matrix - power_hemat_onsite(eps_fq, -1); - auto wc_all = sqrtVqcut_all * (eps_fq - identity) * sqrtVqcut_all; - // sprintf(fn, "inveps_q_%d_freq_%d.mtx", iq, ifreq); - // print_complex_matrix_mm(eps_fq, fn, 1e-15); - // sprintf(fn, "wc_q_%d_freq_%d.mtx", iq, ifreq); - // print_complex_matrix_mm(wc_all, fn, 1e-15); - - // save result to the atom mapping object - for (auto &Mu_Nuchi : MuNuchi) - { - auto Mu = Mu_Nuchi.first; - auto n_mu = atom_mu[Mu]; - for (auto &Nu_chi : Mu_Nuchi.second) - { - auto Nu = Nu_chi.first; - auto n_nu = atom_mu[Nu]; - shared_ptr wc_ptr = make_shared(); - wc_ptr->create(n_mu, n_nu); - for (int i_mu = 0; i_mu != n_mu; i_mu++) - for (int i_nu = 0; i_nu != n_nu; i_nu++) - { - (*wc_ptr)(i_mu, i_nu) = - wc_all(part_range[Mu] + i_mu, part_range[Nu] + 
i_nu); - } - Wc_freq_q[freq][Mu][Nu][q] = - matrix_m>(n_mu, n_nu, wc_ptr->c, MAJOR::ROW, MAJOR::ROW); - } - } - } - } - - return Wc_freq_q; -} - -// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing -map, matrix_m>>>::pair_t_old> -compute_Wc_freq_q_blacs(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, - atpair_k_cplx_mat_t &coulmat_wc, - const vector> &epsmac_LF_imagfreq) -{ - map, matrix_m>>>::pair_t_old> - Wc_freq_q; - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - if (mpi_comm_global_h.myid == 0) - { - cout << "Calculating Wc using ScaLAPACK" << endl; - } - mpi_comm_global_h.barrier(); - - Profiler::start("compute_Wc_freq_q_blacs_init"); - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // Use a square blocksize instead max block, otherwise heev and inversion will complain about - // illegal parameter Maximal blocksize ensure that atom indices related to the rows/columns of a - // local matrix is minimized. - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - // This, however, is not optimal for matrix operations, and may lead to segment fault during - // MPI operations with parallel linear algebra subroutine. 
Thus we define an optimal blocksize - Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); - const int nb_opt = min(128, desc_nabf_nabf.nb()); - desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); - // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric - // matrix - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - // temp_block is used to collect data from IJ-pair data structure with comm_map2_first - auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - // Below are the working arrays for matrix operations - auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); - - const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + - coul_chi0_block.size() + coulwc_block.size()) * - 16.0e-6; - ofs_myid << get_timestamp() - << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks - << endl; - - const auto atpair_local = dispatch_upper_trangular_tasks( - natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, - blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); -#ifdef LIBRPA_DEBUG - ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; - ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; -#endif - - // IJ pair of Wc to be returned - pair, set> Iset_Jset_Wc; - for (const auto &ap : atpair_local) - { - Iset_Jset_Wc.first.insert(ap.first); - Iset_Jset_Wc.second.insert(ap.second); - } - - // Prepare local basis indices for 2D->IJ map - int I, 
iI; - map> map_lor_v; - map> map_loc_v; - for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) - { - int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); - LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); - map_lor_v[I].push_back(iI); - } - for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) - { - int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); - LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); - map_loc_v[I].push_back(iI); - } - - vector> qpts; - for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); - - vec eigenvalues(n_abf); - Profiler::cease("compute_Wc_freq_q_blacs_init"); - LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), - Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); - - Profiler::start("compute_Wc_freq_q_work"); -#ifdef LIBRPA_USE_LIBRI - for (const auto &q : qpts) - { - const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); - const int iq_in_k = - std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); - // q-point in fractional coordinates - const auto &qf = kfrac_list[iq_in_k]; - LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, - qpts.size(), qf.x, qf.y, qf.z); - coul_block.zero_out(); - coulwc_block.zero_out(); - // lib_printf("coul_block\n%s", str(coul_block).c_str()); - - // q-array for LibRI object - std::array qa = {q.x, q.y, q.z}; - - // collect the block elements of truncated coulomb matrices first - // as we reuse coul_eigen_block to reduce memory usage - Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); - { - size_t n_singular_coulwc; - // LibRI tensor for communication, release once done - std::map>, RI::Tensor>>> - couleps_libri; - Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); - for (const auto &Mu_Nu : atpair_local) - { - const auto Mu = 
Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; - if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || - coulmat_wc.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); - } - Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); - - Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); - const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); - Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); - - Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // collect_block_from_IJ_storage_syhe( - // coulwc_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // } - collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, - coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - blacs_ctxt_global_h.ictxt); - Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); - Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); - power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, - desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, - Params::sqrt_coulomb_threshold); - Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); - } - Profiler::stop("epsilon_prepare_coulwc_sqrt"); - 
LIBRPA::utils::lib_printf_root( - "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), - Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); - ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; - - Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); - // collect the block elements of coulomb matrices - { - // LibRI tensor for communication, release once done - std::map>, RI::Tensor>>> - couleps_libri; - ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; - for (const auto &Mu_Nu : atpair_local) - { - const auto Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; - if (coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || - coulmat_eps.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); - } - ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; - // ofs_myid << "Couleps_libri" << endl << couleps_libri; - // if (couleps_libri.size() == 0) - // throw std::logic_error("data at q-point not found in coulmat_eps"); - - // perform communication - ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; -#ifdef LIBRPA_DEBUG - ofs_myid << set_IJ_nabf_nabf << endl; - ofs_myid << "Extended blocks" << endl; - ofs_myid << "atom 1: " << s0_s1.first << endl; - ofs_myid << "atom 2: " << s0_s1.second << endl; -#endif - // ofs_myid << "Owned blocks\n"; - // print_keys(ofs_myid, couleps_libri); - // mpi_comm_global_h.barrier(); - const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - 
mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); - ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; - - ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; - collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, - coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - blacs_ctxt_global_h.ictxt); - ofs_myid << get_timestamp() << " Done construct couleps 2D block" << endl; - } - // char fn[100]; - // sprintf(fn, "couleps_iq_%d.mtx", iq); - // print_matrix_mm_file_parallel(fn, coul_block, desc_nabf_nabf); - // ofs_myid << str(coul_block); - // lib_printf("coul_block\n%s", str(coul_block).c_str()); - - size_t n_singular; - ofs_myid << get_timestamp() << " Start power hemat couleps\n"; - matrix_m> sqrtveig_blacs; - if (is_gamma_point(q)) - { - // choice of power_hemat_blacs_real/power_hemat_blacs_desc - // leads to sub-meV difference - sqrtveig_blacs = power_hemat_blacs_real( - coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, - eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); - if (Params::replace_w_head && Params::option_dielect_func == 3) - { - df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); - } - } - else - { - sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, - desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, - Params::sqrt_coulomb_threshold); - } - ofs_myid << get_timestamp() << " Done power hemat couleps\n"; - // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); - // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at - // imaginary frequency is not prepared - if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); - const size_t n_nonsingular = n_abf - n_singular; - 
Profiler::stop("epsilon_prepare_couleps_sqrt"); - LIBRPA::utils::lib_printf_root( - "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), - Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); - ofs_myid << get_timestamp() << " Done couleps sqrt\n"; - std::flush(ofs_myid); - - for (const auto &freq : chi0.tfg.get_freq_nodes()) - { - const auto ifreq = chi0.tfg.get_freq_index(freq); - Profiler::start("epsilon_wc_work_q_omega"); - Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); - chi0_block.zero_out(); - { - std::map>, - RI::Tensor>>> - chi0_libri; - if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) - { - const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); - for (const auto &M_Nchi : chi0_wq) - { - const auto &M = M_Nchi.first; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_chi : M_Nchi.second) - { - const auto &N = N_chi.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &chi = N_chi.second; - std::valarray> chi_va(chi.c, chi.size); - auto pchi = std::make_shared>>(); - *pchi = chi_va; - chi0_libri[M][{N, qa}] = - RI::Tensor>({n_mu, n_nu}, pchi); - } - } - // Release the chi0 block for this frequency and q to reduce memory load, - // as they will not be used again - chi0.free_chi0_q(freq, q); - } - // ofs_myid << "chi0_libri" << endl << chi0_libri; - Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); - const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); - Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); - // ofs_myid << "IJq_chi0" << endl << IJq_chi0; - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // collect_block_from_IJ_storage_syhe( - // chi0_block, desc_nabf_nabf, 
LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_chi0.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // } - Profiler::start("epsilon_prepare_chi0_2d_collect_block"); - collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, - LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, - MAJOR::ROW); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, - desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); - Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); - // sprintf(fn, "chi_ifreq_%d_iq_%d.mtx", ifreq, iq); - // print_matrix_mm_file_parallel(fn, chi0_block, desc_nabf_nabf); - } - Profiler::stop("epsilon_prepare_chi0_2d"); - - Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); - - // for Gamma point, overwrite the head term - if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) - { - ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; - // rotate to Coulomb-eigenvector basis - // descending order - ScalapackConnector::pgemm_f( - 'N', 'N', n_abf, n_nonsingular, n_abf, 1.0, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - 0.0, coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemm_f('C', 'N', n_nonsingular, n_nonsingular, n_abf, 1.0, - sqrtveig_blacs.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - - if (Params::option_dielect_func == 3) - { - chi0_block *= -1.0; - for (int i = 0; i != n_nonsingular; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) += 1.0; - } - ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" - << endl; - df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); - 
} - else - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); - if (ilo >= 0 && jlo >= 0) - { - ofs_myid << get_timestamp() << "Perform the head element overwrite" << endl; - chi0_block(ilo, jlo) = 1.0 - epsmac_LF_imagfreq[ifreq]; - } - } - // rotate back to ABF - // descending order - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_nonsingular, n_nonsingular, 1.0, - coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemm_f('N', 'C', n_abf, n_abf, n_nonsingular, 1.0, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - coul_eigen_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - 0.0, chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - if (Params::option_dielect_func != 3) - { - // now chi0_block is actually v1/2 chi v1/2 - chi0_block *= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) += 1.0; - } - // now chi0_block is actually the dielectric matrix - // perform inversion - Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); - invert_scalapack(chi0_block, desc_nabf_nabf_opt); - } - // subtract 1 from diagonal - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) -= 1.0; - } - } - else - { - Profiler::start("epsilon_compute_eps_pgemm_1"); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_block.ptr(), 1, - 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, - 1, desc_nabf_nabf_opt.desc); - Profiler::cease("epsilon_compute_eps_pgemm_1"); - 
Profiler::start("epsilon_compute_eps_pgemm_2"); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, - coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, - coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, 0.0, - chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc); - Profiler::cease("epsilon_compute_eps_pgemm_2"); - // now chi0_block is actually v1/2 chi v1/2 - chi0_block *= -1.0; - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) += 1.0; - } - Profiler::stop("epsilon_compute_eps"); - // now chi0_block is actually the dielectric matrix - // perform inversion - Profiler::start("epsilon_invert_eps", "Invert dielectric matrix"); - invert_scalapack(chi0_block, desc_nabf_nabf_opt); - // subtract 1 from diagonal - for (int i = 0; i != n_abf; i++) - { - const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - if (ilo < 0) continue; - const int jlo = desc_nabf_nabf_opt.indx_g2l_c(i); - if (jlo < 0) continue; - chi0_block(ilo, jlo) -= 1.0; - } - Profiler::stop("epsilon_invert_eps"); - } - // debug for GaAs - // for (int i = 0; i != n_abf; i++) - // { - // for (int j = 0; j != n_abf; j++) - // { - // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - // if (ilo < 0) continue; - // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); - // if (jlo < 0) continue; - // if(i==j) - // chi0_block(ilo, jlo) = 1.0; - // else - // chi0_block(ilo, jlo) = 0.0; - // } - // } - // debug for unfold shrink Wc - // for (int i = 0; i != n_abf; i++) - //{ - // const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); - // if (ilo < 0) continue; - // for (int j = 0; j != n_abf; j++) - // { - // const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); - // if (jlo < 0) continue; - // if (i == j) - // chi0_block(ilo, jlo) = 1.0; - // else - // chi0_block(ilo, jlo) = 0.0; - // } - // } - // debug end - - 
Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coulwc_block.ptr(), 1, - 1, desc_nabf_nabf_opt.desc, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, 0.0, coul_chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemm_f('N', 'N', n_abf, n_abf, n_abf, 1.0, coul_chi0_block.ptr(), - 1, 1, desc_nabf_nabf_opt.desc, coulwc_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, 0.0, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc); - ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, - desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, - desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); - Profiler::stop("epsilon_multiply_coulwc"); - // lib_printf("chi0_block\n%s", str(chi0_block).c_str()); - // now chi0_block is the screened Coulomb interaction Wc (i.e. W-V) - - Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); - Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); - map>>> Wc_MNmap; - // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, - // LIBRPA::atomic_basis_abf, chi0_block, - // desc_nabf_nabf, MAJOR::ROW); - map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, - temp_block, desc_nabf_nabf, MAJOR::ROW); - Profiler::stop("epsilon_convert_wc_map_block"); - - Profiler::start("epsilon_convert_wc_communicate", "Communicate"); - { - std::map>, - RI::Tensor>>> - Wc_libri; - Profiler::start("epsilon_convert_wc_communicate_1"); - for (const auto &M_NWc : Wc_MNmap) - { - const auto &M = M_NWc.first; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - for (const auto &N_Wc : M_NWc.second) - { - const auto &N = N_Wc.first; - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - const auto &Wc = N_Wc.second; - // std::valarray> Wc_va(Wc.ptr(), Wc.size()); - // auto pWc = std::make_shared>>(); - // *pWc = Wc_va; - /*if (iq == 10 && ifreq == 10) - { - 
char fn[100]; - sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); - print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); - }*/ - Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); - } - } - Profiler::stop("epsilon_convert_wc_communicate_1"); - Profiler::start("epsilon_convert_wc_communicate_2"); - // main timing - // cout << Wc_libri; - const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( - mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); - Profiler::stop("epsilon_convert_wc_communicate_2"); - Profiler::start("epsilon_convert_wc_communicate_3"); - // parse collected to - for (const auto &MN : atpair_local) - { - const auto &M = MN.first; - const auto &N = MN.second; - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); - // Use row major for later usage in LibRI - Wc_freq_q[freq][M][N][q] = matrix_m>( - n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); - } - Profiler::stop("epsilon_convert_wc_communicate_3"); - // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) - // for ( int i_nu = 0; i_nu != n_nu; i_nu++ ) - // { - // } - } - Profiler::stop("epsilon_convert_wc_communicate"); - Profiler::stop("epsilon_convert_wc_2d_to_ij"); - Profiler::cease("epsilon_wc_work_q_omega"); - LIBRPA::utils::lib_printf_root( - "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, - Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), - Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); - } - } -#else - throw std::logic_error("need compilation with LibRI"); -#endif - Profiler::cease("compute_Wc_freq_q_work"); - LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", - Profiler::get_wall_time_last("compute_Wc_freq_q_work"), - Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); - - return Wc_freq_q; -} - -map, matrix_m>>>::pair_t_old> -FT_Wc_freq_q( - const map, matrix_m>>>::pair_t_old> - 
&Wc_freq_q, - const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) -{ - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - map, matrix_m>>>::pair_t_old> - Wc_freq_R; - const int ngrids = tfg.get_n_grids(); - if (Params::debug) - { - if (mpi_comm_global_h.is_root()) lib_printf("Converting Wc q,w -> R,t\n"); - mpi_comm_global_h.barrier(); - } - set> atpairs_unique; - for (const auto &freq_MuNuqWc : Wc_freq_q) - { - for (const auto &Mu_NuqWc : freq_MuNuqWc.second) - { - const auto Mu = Mu_NuqWc.first; - for (const auto &Nu_qWc : Mu_NuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - } - - vector>, pair>> ifreqR_atpair_all; - // allocate space before hand - for (auto R : Rlist) - { - for (int ifreq = 0; ifreq != ngrids; ifreq++) - { - auto freq = tfg.get_freq_nodes()[ifreq]; - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu[Nu]; - Wc_freq_R[freq][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); - ifreqR_atpair_all.push_back({{ifreq, R}, atpair_unique}); - } - } - } - -#pragma omp parallel for schedule(dynamic) - for (auto ifreqR_atpair : ifreqR_atpair_all) - { - const auto ifreq = ifreqR_atpair.first.first; - const auto freq = tfg.get_freq_nodes()[ifreq]; - const auto R = ifreqR_atpair.first.second; - const auto Mu = ifreqR_atpair.second.first; - const auto Nu = ifreqR_atpair.second.second; - const int n_mu = atom_mu[Mu]; - const int n_nu = atom_mu[Nu]; - - // thread local temporary matrix - matrix_m> WfR_temp(n_mu, n_nu, major_Wc); - - if (Wc_freq_q.count(freq) == 0) continue; - if (Wc_freq_q.at(freq).count(Mu) == 0) continue; - if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; - - for (auto &Wc_q : Wc_freq_q.at(freq).at(Mu).at(Nu)) - { - const auto 
q = Wc_q.first; - const auto &Wc = Wc_q.second; - for (auto q_bz : map_irk_ks[q]) - { - const double ang = -q_bz * (R * latvec) * TWO_PI; - const complex weight = - complex(cos(ang), sin(ang)) / double(n_k_points); - if (q == q_bz) - WfR_temp += Wc * weight; - else - WfR_temp += conj(Wc) * weight; - } - } - // omp_set_lock(&lock_Wc); - Wc_freq_R[freq][Mu][Nu][R] += WfR_temp; - // omp_unset_lock(&lock_Wc); - } - - if (mpi_comm_global_h.is_root()) - { - lib_printf("Done converting Wc(q,w) -> Wc(R,w)\n"); - } - mpi_comm_global_h.barrier(); - - return Wc_freq_R; -} - -map, matrix_m>>>::pair_t_old> -CT_FT_Wc_freq_q( - const map, matrix_m>>>::pair_t_old> - &Wc_freq_q, - const TFGrids &tfg, const int &n_k_points, const vector> &Rlist) -{ - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - map, matrix_m>>>::pair_t_old> - Wc_tau_R; - if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); - const int ngrids = tfg.get_n_grids(); - - LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(R,t)\n"); - mpi_comm_global_h.barrier(); - - set> atpairs_unique; - for (const auto &freq_MuNuqWc : Wc_freq_q) - { - for (const auto &Mu_NuqWc : freq_MuNuqWc.second) - { - const auto Mu = Mu_NuqWc.first; - for (const auto &Nu_qWc : Mu_NuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - } - - vector>, pair>> itauR_atpair_all; - // allocate space before hand - for (auto R : Rlist) - { - for (int itau = 0; itau != ngrids; itau++) - { - auto tau = tfg.get_time_nodes()[itau]; - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu[Nu]; - Wc_tau_R[tau][Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); - itauR_atpair_all.push_back({{itau, R}, atpair_unique}); - } - } - } 
- - LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", - LIBRPA::envs::myid_global, itauR_atpair_all.size(), - omp_get_max_threads()); - -#pragma omp parallel for schedule(dynamic) - for (auto itauR_atpair : itauR_atpair_all) - { - const auto itau = itauR_atpair.first.first; - const auto tau = tfg.get_time_nodes()[itau]; - const auto R = itauR_atpair.first.second; - const auto Mu = itauR_atpair.second.first; - const auto Nu = itauR_atpair.second.second; - const int n_mu = atom_mu[Mu]; - const int n_nu = atom_mu[Nu]; - - // thread local temporary matrix - matrix_m> WtR_temp(n_mu, n_nu, major_Wc); - - for (int ifreq = 0; ifreq < ngrids; ifreq++) - { - const auto freq = tfg.get_freq_nodes()[ifreq]; - const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); - // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << - // "\n"; - if (Wc_freq_q.count(freq) == 0) continue; - if (Wc_freq_q.at(freq).count(Mu) == 0) continue; - if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; - // cout << "freq: " << freq << "\n"; - - const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); - for (auto &Wc_q : Wc_q_all) - { - const auto q = Wc_q.first; - const auto &Wc = Wc_q.second; - for (auto q_bz : map_irk_ks[q]) - { - const double ang = -q_bz * (R * latvec) * TWO_PI; - const complex weight = - complex(cos(ang), sin(ang)) * f2t / double(n_k_points); - // ofs_myid << q << " " << q_bz << " weight = " << weight << "\n"; - // ofs_myid << q_Wc.second; - if (q == q_bz) - WtR_temp += Wc * weight; - else - WtR_temp += conj(Wc) * weight; - } - } - } - // omp_set_lock(&lock_Wc); - Wc_tau_R[tau][Mu][Nu][R] += WtR_temp; - // omp_unset_lock(&lock_Wc); - } - - LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> R,t\n"); - mpi_comm_global_h.barrier(); - - // myz debug: check the imaginary part of the matrix - // NOTE: if G(R) is real, is W(R) real as well? 
- // if (Params::debug) - // { - // for (const auto & tau_MuNuRWc: Wc_tau_R) - // { - // char fn[80]; - // auto tau = tau_MuNuRWc.first; - // auto itau = tfg.get_time_index(tau); - // for (const auto & Mu_NuRWc: tau_MuNuRWc.second) - // { - // auto Mu = Mu_NuRWc.first; - // // const int n_mu = atom_mu[Mu]; - // for (const auto & Nu_RWc: Mu_NuRWc.second) - // { - // auto Nu = Nu_RWc.first; - // // const int n_nu = atom_mu[Nu]; - // for (const auto & R_Wc: Nu_RWc.second) - // { - // auto R = R_Wc.first; - // auto Wc = R_Wc.second; - // auto iteR = std::find(Rlist.cbegin(), Rlist.cend(), R); - // auto iR = std::distance(Rlist.cbegin(), iteR); - // sprintf(fn, "Wc_Mu_%zu_Nu_%zu_iR_%zu_itau_%d_id_%d.mtx", Mu, Nu, iR, - // itau, mpi_comm_global_h.myid); print_matrix_mm_file(Wc, - // Params::output_dir + "/" + fn, 1e-10); - // } - // } - // } - // } - // } - // end myz debug - return Wc_tau_R; -} - -map, matrix_m>>>::pair_t_old> -CT_FT_Wc_freq2time_q( - const map, matrix_m>>>::pair_t_old> - &Wc_freq_q, - const TFGrids &tfg, const int &n_k_points, const vector> &Rlist, - const vector> &qlist) -{ - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - map, matrix_m>>>::pair_t_old> - Wc_tau_q; - if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); - const int ngrids = tfg.get_n_grids(); - - LIBRPA::utils::lib_printf_root("Converting Wc(q,w) -> W(q,t)\n"); - mpi_comm_global_h.barrier(); - - set> atpairs_unique; - for (const auto &freq_MuNuqWc : Wc_freq_q) - { - for (const auto &Mu_NuqWc : freq_MuNuqWc.second) - { - const auto Mu = Mu_NuqWc.first; - for (const auto &Nu_qWc : Mu_NuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - } - vector>> itau_atpair_all; - // allocate space before hand - - for (int itau = 0; itau != ngrids; itau++) - { - auto tau = 
tfg.get_time_nodes()[itau]; - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu_s[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu_s[Nu]; - for (auto q : qlist) - Wc_tau_q[tau][Mu][Nu][q] = matrix_m>(n_mu, n_nu, major_Wc); - itau_atpair_all.push_back({itau, atpair_unique}); - } - } - - LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", - LIBRPA::envs::myid_global, itau_atpair_all.size(), - omp_get_max_threads()); - -#pragma omp parallel for schedule(dynamic) - for (auto itau_atpair : itau_atpair_all) - { - const auto itau = itau_atpair.first; - const auto tau = tfg.get_time_nodes()[itau]; - const auto Mu = itau_atpair.second.first; - const auto Nu = itau_atpair.second.second; - const int n_mu = atom_mu_s[Mu]; - const int n_nu = atom_mu_s[Nu]; - - // thread local temporary matrix - matrix_m> Wtq_temp(n_mu, n_nu, major_Wc); - - for (int ifreq = 0; ifreq < ngrids; ifreq++) - { - const auto freq = tfg.get_freq_nodes()[ifreq]; - const auto f2t = tfg.get_costrans_f2t()(itau, ifreq); - // ofs_myid << "f2t cos eff for freq " << freq << " -> tau " << tau << ": " << f2t << - // "\n"; - if (Wc_freq_q.count(freq) == 0) continue; - if (Wc_freq_q.at(freq).count(Mu) == 0) continue; - if (Wc_freq_q.at(freq).at(Mu).count(Nu) == 0) continue; - // cout << "freq: " << freq << "\n"; - - const auto &Wc_q_all = Wc_freq_q.at(freq).at(Mu).at(Nu); - for (auto &Wc_q : Wc_q_all) - { - const auto q = Wc_q.first; - const auto &Wc = Wc_q.second; - const double weight = f2t; - Wtq_temp = Wc * weight; - // omp_set_lock(&lock_Wc); - Wc_tau_q[tau][Mu][Nu][q] += Wtq_temp; - // omp_unset_lock(&lock_Wc); - } - } - } - - LIBRPA::utils::lib_printf_root("Done converting Wc q,w -> q,t\n"); - mpi_comm_global_h.barrier(); - - return Wc_tau_q; -} - -atom_mapping, matrix_m>>>::pair_t_old CT_FT_Wc_tau_R2q( - const atom_mapping, matrix_m>>>::pair_t_old - &Wc_tau_q, - const TFGrids &tfg, 
const int &n_kpoints, const vector> &Rlist, - const int &itau) -{ - const int tau = tfg.get_time_nodes()[itau]; - // major of Wc_freq_q input and Wc_tau_R output - const MAJOR major_Wc = MAJOR::ROW; - - atom_mapping, matrix_m>>>::pair_t_old Wc_tau_R; - if (!tfg.has_time_grids()) throw logic_error("TFGrids object does not have time grids"); - const int ngrids = tfg.get_n_grids(); - - LIBRPA::utils::lib_printf_root("Converting Wc(q,t) -> W(R,t)\n"); - mpi_comm_global_h.barrier(); - - set> atpairs_unique; - for (const auto &MuNuqWc : Wc_tau_q) - { - const auto Mu = MuNuqWc.first; - for (const auto &Nu_qWc : MuNuqWc.second) - { - const auto Nu = Nu_qWc.first; - atpairs_unique.insert({Mu, Nu}); - for (const auto &q_Wc : Nu_qWc.second) - { - assert(q_Wc.second.major() == major_Wc); - } - } - } - - vector, pair>> iR_atpair_all; - // allocate space before hand - for (auto R : Rlist) - { - for (auto atpair_unique : atpairs_unique) - { - const auto Mu = atpair_unique.first; - const int n_mu = atom_mu_l[Mu]; - const auto Nu = atpair_unique.second; - const int n_nu = atom_mu_l[Nu]; - Wc_tau_R[Mu][Nu][R] = matrix_m>(n_mu, n_nu, major_Wc); - iR_atpair_all.push_back({R, atpair_unique}); - } - } - - LIBRPA::utils::lib_printf_coll("Task %4d: distributing %d {I, J, R, tau} on %d threads\n", - LIBRPA::envs::myid_global, iR_atpair_all.size(), - omp_get_max_threads()); - -#pragma omp parallel for schedule(dynamic) - for (auto iR_atpair : iR_atpair_all) - { - const auto R = iR_atpair.first; - const auto Mu = iR_atpair.second.first; - const auto Nu = iR_atpair.second.second; - const int n_mu = atom_mu_l[Mu]; - const int n_nu = atom_mu_l[Nu]; - - // thread local temporary matrix - matrix_m> WtR_temp(n_mu, n_nu, major_Wc); - - if (Wc_tau_q.count(Mu) == 0) continue; - if (Wc_tau_q.at(Mu).count(Nu) == 0) continue; - // cout << "freq: " << freq << "\n"; - - const auto &Wc_q_all = Wc_tau_q.at(Mu).at(Nu); - for (auto &Wc_q : Wc_q_all) - { - const auto q = Wc_q.first; - const auto &Wc = 
Wc_q.second; - for (auto q_bz : map_irk_ks[q]) - { - const double ang = -q_bz * (R * latvec) * TWO_PI; - const complex weight = - complex(cos(ang), sin(ang)) / double(n_kpoints); - if (q == q_bz) - WtR_temp += Wc * weight; - else - WtR_temp += conj(Wc) * weight; - } - } - // omp_set_lock(&lock_Wc); - Wc_tau_R[Mu][Nu][R] += WtR_temp; - // omp_unset_lock(&lock_Wc); - } - - LIBRPA::utils::lib_printf_root("Done converting Wc q,t -> R,t\n"); - mpi_comm_global_h.barrier(); - return Wc_tau_R; -} - -void test_libcomm_for_system(const atpair_k_cplx_mat_t &coulmat) -{ - if (mpi_comm_global_h.myid == 0) lib_printf("test_libcomm_for_system Coulumb\n"); - // lib_printf("Calculating EcRPA with BLACS, pid: %d\n", mpi_comm_global_h.myid); - const complex CONE{1.0, 0.0}; - const int n_abf = LIBRPA::atomic_basis_abf.nb_total; - const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); - - mpi_comm_global_h.barrier(); - - Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); - // use a square blocksize instead max block, otherwise heev and inversion will complain about - // illegal parameter - desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); - const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( - 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); - const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); - - auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); - - vector> qpts; - for (const auto &qMuNuchi : irk_weight) qpts.push_back(qMuNuchi.first); - -#ifdef LIBRPA_USE_LIBRI - for (const auto &q : qpts) - { - coul_block.zero_out(); - - int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); - std::array qa = {q.x, q.y, q.z}; - // collect the block elements of coulomb matrices - { - double vq_begin = omp_get_wtime(); - // LibRI tensor for communication, release once done - std::map>, Tensor>>> - coul_libri; - coul_libri.clear(); - int count_coul = 0; - for (const auto &Mu_Nu : local_atpair) - { - const auto 
Mu = Mu_Nu.first; - const auto Nu = Mu_Nu.second; - // ofs_myid << "myid " << blacs_ctxt_global_h.myid << "Mu " << Mu << " Nu " << Nu << - // endl; - if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || - coulmat.at(Mu).at(Nu).count(q) == 0) - continue; - const auto &Vq = coulmat.at(Mu).at(Nu).at(q); - const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); - const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); - std::valarray> Vq_va(Vq->c, Vq->size); - auto pvq = std::make_shared>>(); - *pvq = Vq_va; - coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); - count_coul += 1; - } - int count_pair = 0; - for (auto &Mu : coul_libri) - { - for (auto &nu_q : Mu.second) - { - count_pair += 1; - } - } - // printf("Finish RPA blacs 2d vq arr\n"); - double arr_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - double comm_begin = omp_get_wtime(); - lib_printf( - "Begin comm_map2_first myid: %d q:(%f, %f, %f) count_coul: %d count_pair: %d\n", - mpi_comm_global_h.myid, q.x, q.y, q.z, count_coul, count_pair); - const auto IJq_coul = - comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); - double comm_end = omp_get_wtime(); - mpi_comm_global_h.barrier(); - // printf("End vq comm_map2_first myid: %d TIME_USED: - // %f\n",mpi_comm_global_h.myid,comm_end-comm_begin); - // ofs_myid << "IJq_coul" << endl << IJq_coul; - // printf("Finish RPA blacs 2d vq 2d\n"); - double block_begin = omp_get_wtime(); - // for (const auto &IJ: set_IJ_nabf_nabf) - // { - // const auto &I = IJ.first; - // const auto &J = IJ.second; - // // cout << IJq_coul.at(I).at({J, qa}); - // collect_block_from_IJ_storage_syhe( - // coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, IJ.first, - // IJ.second, true, CONE, IJq_coul.at(I).at({J, qa}).ptr(), MAJOR::ROW); - // // lib_printf("myid %d I %d J %d nr %d nc %d\n%s", - // // blacs_ctxt_global_h.myid, I, J, - // // coul_block.nr(), coul_block.nc(), - // // str(coul_block).c_str()); - // } - 
collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, - qa, true, CONE, IJq_coul, MAJOR::ROW); - double block_end = omp_get_wtime(); - // lib_printf("Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f - // pair_size: %d\n",mpi_comm_global_h.myid,arr_end-vq_begin, comm_end-comm_begin, - // block_end-block_begin,set_IJ_nabf_nabf.size()); - mpi_comm_global_h.barrier(); - double vq_end = omp_get_wtime(); - - if (mpi_comm_global_h.myid == 0) - lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", - vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, - vq_end - block_begin); - } - } - lib_printf("Success test_libcomm_for_system\n"); -#endif -} From ee6853bdcff7abfd113ab07268553904d242886c Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:19:41 +0800 Subject: [PATCH 04/18] Delete specific file --- epsilon.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/epsilon.cpp b/epsilon.cpp index 8b137891..139597f9 100644 --- a/epsilon.cpp +++ b/epsilon.cpp @@ -1 +1,2 @@ + From ece4c3f8702f4c6f3df8d8614f1de2038c908089 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 25 Jul 2025 01:23:08 +0800 Subject: [PATCH 05/18] Delete epsilon.cpp --- epsilon.cpp | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 epsilon.cpp diff --git a/epsilon.cpp b/epsilon.cpp deleted file mode 100644 index 139597f9..00000000 --- a/epsilon.cpp +++ /dev/null @@ -1,2 +0,0 @@ - - From ab5910f48810c9731b05167116af68bb659519e3 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:33:52 +0800 Subject: [PATCH 06/18] change pgemm by pgeadd in cal_eps replace commented-out identity matrix initialization and matrix multiplication code by plead --- src/dielecmodel.cpp | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff 
--git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index b58a2059..87507dcd 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -1296,24 +1296,30 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - auto identity = init_local_mat>(desc_body, MAJOR::COL); - for (int i = 0; i < n_nonsingular - 1; i++) - { - const int ilo = desc_body.indx_g2l_r(i); - if (ilo < 0) continue; - for (int j = 0; j < n_nonsingular - 1; j++) - { - const int jlo = desc_body.indx_g2l_c(j); - if (jlo < 0) continue; - if (i == j) - identity(ilo, jlo) = 1.0; - else - identity(ilo, jlo) = 0.0; - } - } - ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - 1, - 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, - desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + // auto identity = init_local_mat>(desc_body, MAJOR::COL); + // for (int i = 0; i < n_nonsingular - 1; i++) + // { + // const int ilo = desc_body.indx_g2l_r(i); + // if (ilo < 0) continue; + // for (int j = 0; j < n_nonsingular - 1; j++) + // { + // const int jlo = desc_body.indx_g2l_c(j); + // if (jlo < 0) continue; + // if (i == j) + // identity(ilo, jlo) = 1.0; + // else + // identity(ilo, jlo) = 0.0; + // } + // } + // ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - 1, + // 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, + // desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + ScalapackConnector::pgeadd_f( + 'N', n_nonsingular - 1, n_nonsingular - 1, + 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, + 1.0, + chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); Profiler::stop("cal_inverse_dielectric_matrix_ij"); if (mpi_comm_global_h.is_root()) std::cout << "* Success: calculate average inverse dielectric matrix no." 
<< ifreq + 1 From c7ead5c9b95fcad3bde9ee8465f27f5833c03d48 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:50:29 +0800 Subject: [PATCH 07/18] add gpu install setting --- CMakeLists.txt | 52 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9d4ee05..4264025f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,14 +20,18 @@ set(LIB_NAME "rpa") set(FORTRAN_LIB_NAME "rpa_f") # options setup -option(USE_LIBRI "Use LibRI for tensor contraction" OFF) +option(USE_LIBRI "Use LibRI for tensor contraction" ON) option(USE_CMAKE_INC "Use cmake.inc for configure" OFF) -option(USE_GREENX_API "Use GreenX API for minimax grids generation" OFF) +option(USE_GREENX_API "Use GreenX API for minimax grids generation" ON) option(USE_EXTERNAL_GREENX "Use external GreenX library rather than the packaged one" OFF) option(ENABLE_TEST "Flag to build unit tests" ON) option(ENABLE_DRIVER "Flag to build driver executables" ON) option(ENABLE_FORTRAN_BIND "Flag to build Fotran binding" OFF) option(VERBOSE_OUTPUT "Flag to print verbose information in stdout and process output" OFF) +option(USE_CUDA "Use cuda for EcRPA calculation" OFF) +option(ENABLE_CUSOLVERMP "Use cusolverMp for EcRPA calculation" OFF) +option(ENABLE_CUBLASMP "Use cublasmp for EcRPA calculation" OFF ) +option(ENABLE_NVHPC "Use nvhpc for calculation" OFF ) # NOTE: static library not tested option(BUILD_LIBRPA_SHARED "Flag to build shared libraries" ON) @@ -61,6 +65,10 @@ if(USE_GREENX_API OR ENABLE_FORTRAN_BIND) else() enable_language(CXX) endif() +# added by hbchen in 2025-5-19 +if(USE_CUDA) + enable_language(CUDA) +endif() # bypass the deprecation warning of classic C++ of oneAPI if(CMAKE_CXX_COMPILER_ID MATCHES Intel) @@ -177,6 +185,46 @@ if(USE_LIBRI) add_compile_definitions("LIBRPA_USE_LIBRI") endif() + +# add gpu accelaration by chenhaobo in 2025-04-26 +# 
cmake_policy(SET CMP0146 NEW) +if(USE_CUDA) + find_package(CUDA REQUIRED) + find_package(CUDAToolkit REQUIRED) + add_definitions(-DADD_) + include_directories(${MAGMA_ROOT}/include) + set(MAGMA_INCLUDE_DIR "${MAGMA_ROOT}/include") + set(MAGMA_INCLUDE_DIR $ENV{MAGMA_INCLUDE_DIR}) + link_directories(${MAGMA_ROOT}/lib) + set(CMAKE_CUDA_STANDARD 14) + set(CUDA_LIBRARIES "-lcublas -lcudart -lcusolver") + list(APPEND math_libs + ${CUDA_LIBRARIES} + ${MAGMA_ROOT}/lib/libmagma.so + ) + add_compile_definitions("LIBRPA_USE_CUDA") +endif() +if(ENABLE_NVHPC) +add_compile_definitions("ENABLE_NVHPC") +set(ENABLE_CUSOLVERMP ON) +set(ENABLE_CUBLASMP ON) +endif() +if(ENABLE_CUSOLVERMP) + add_compile_definitions("ENABLE_CUSOLVERMP") + set(CUSOLVERMP_LIBRAYIES "-lcal -lcusolverMp") + list(APPEND math_libs + ${CUSOLVERMP_LIBRAYIES} + ) +endif() +if(ENABLE_CUBLASMP) + add_compile_definitions("ENABLE_CUBLASMP") + set(CUBLASMP_LIBRAYIES "-lcublasmp -lcurand -lcublas") + list(APPEND math_libs + ${CUBLASMP_LIBRAYIES} + ) +endif() +# finish adding gpu accelaration by chenhaobo in 2025-04-26 + if(CMAKE_BUILD_TYPE MATCHES "Debug") message(STATUS "Build type set to Debug, adding LIBRPA_DEBUG preprocessor directive") add_compile_definitions("LIBRPA_DEBUG") From b47a007254448e15c4c4a6a6aa8680f53a398034 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:55:23 +0800 Subject: [PATCH 08/18] add gpu calculation for rpa and g0w0_band --- src/CMakeLists.txt | 23 + src/app_rpa.cpp | 21 + src/array_desc_device.cu | 36 + src/array_desc_device.h | 61 ++ src/cuda_connector.cpp | 2108 ++++++++++++++++++++++++++++++++++++++ src/cuda_connector.cu | 601 +++++++++++ src/cuda_connector.h | 621 +++++++++++ src/device_connector.cpp | 248 +++++ src/device_connector.h | 54 + src/device_stream.cpp | 34 + src/device_stream.h | 134 +++ src/epsilon_cuda.cpp | 1448 ++++++++++++++++++++++++++ src/epsilon_cuda.h | 18 + src/matrix_device.cpp | 16 + 
src/matrix_device.h | 111 ++ 15 files changed, 5534 insertions(+) create mode 100644 src/array_desc_device.cu create mode 100644 src/array_desc_device.h create mode 100644 src/cuda_connector.cpp create mode 100644 src/cuda_connector.cu create mode 100644 src/cuda_connector.h create mode 100644 src/device_connector.cpp create mode 100644 src/device_connector.h create mode 100644 src/device_stream.cpp create mode 100644 src/device_stream.h create mode 100644 src/epsilon_cuda.cpp create mode 100644 src/epsilon_cuda.h create mode 100644 src/matrix_device.cpp create mode 100644 src/matrix_device.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e36be38e..213ef544 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -72,6 +72,29 @@ else() list(APPEND lib_sources get_minimax_local.cpp) endif() +if(USE_CUDA) + list(APPEND lib_sources + epsilon_cuda.cpp + cuda_connector.cu + ) +endif() +if(ENABLE_NVHPC) + list(APPEND lib_sources + cuda_connector.cpp + array_desc_device.cu + device_stream.cpp + matrix_device.cpp + device_connector.cpp + ) + # set_source_files_properties( + # base_blacs.cpp + # PROPERTIES + # LANGUAGE CUDA + # # COMPILE_OPTIONS "--x cu" + # ) +endif() + + target_include_directories(rpa_lib PRIVATE # ${CMAKE_CURRENT_LIST_DIR} diff --git a/src/app_rpa.cpp b/src/app_rpa.cpp index a6cb3128..a837684c 100644 --- a/src/app_rpa.cpp +++ b/src/app_rpa.cpp @@ -11,6 +11,10 @@ #include "stl_io_helper.h" #include "utils_mem.h" #include "utils_timefreq.h" +#ifdef LIBRPA_USE_CUDA +#include // added by hbchen in 2025-5-17 +#include // added by hbchen in 2025-7-26 +#endif namespace LIBRPA { @@ -87,6 +91,23 @@ void get_rpa_correlation_energy_(std::complex &rpa_corr, mpi_comm_global_h.barrier(); Profiler::start("EcRPA", "Compute RPA correlation Energy"); CorrEnergy corr; + #ifdef LIBRPA_USE_CUDA + int deviceCount; + cudaError_t err= cudaGetDeviceCount(&deviceCount); + // lib_printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0); + printf("Number of CUDA 
devices: %d\n", deviceCount); + // deviceCount!=4 is a bug, because if I don't set the gres=gpu:*, cuda will detect all the gpu device + if(err==cudaSuccess&&deviceCount>0){//说明存在gpu设备 + #ifdef ENABLE_NVHPC + if (Params::use_scalapack_ecrpa && + (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI)) + corr = compute_RPA_correlation_blacs_2d_cuda(chi0, Vq); + else + #endif + corr = compute_RPA_correlation_cuda(chi0, Vq); + }else + #endif if (Params::use_scalapack_ecrpa && (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI)) diff --git a/src/array_desc_device.cu b/src/array_desc_device.cu new file mode 100644 index 00000000..5dab0fa5 --- /dev/null +++ b/src/array_desc_device.cu @@ -0,0 +1,36 @@ +#include "array_desc_device.h" + +Array_Desc_Device::Array_Desc_Device(const LIBRPA::Array_Desc& array_desc) { + this->ictxt_ = array_desc.ictxt(); + this->m_ = array_desc.m(); + this->n_ = array_desc.n(); + this->mb_ = array_desc.mb(); + this->nb_ = array_desc.nb(); + this->lld_ = array_desc.lld(); + this->irsrc_ = array_desc.irsrc(); + this->icsrc_ = array_desc.icsrc(); + this->m_local_ = array_desc.m_loc(); + this->n_local_ = array_desc.n_loc(); + this->myprow_ = array_desc.myprow(); + this->mypcol_ = array_desc.mypcol(); + this->nprows_ = array_desc.nprows(); + this->npcols_ = array_desc.npcols(); +} +__host__ __device__ int Array_Desc_Device::indx_g2p(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return (isrcproc + indxglob / nb) % nprocs; +} +__host__ __device__ int Array_Desc_Device::indx_g2l(const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs) { + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__host__ __device__ int Array_Desc_Device::indx_g2l_r(int gindx)const{ + return this->myprow_ != indx_g2p(gindx, this->mb_, this->irsrc_, this->nprows_) || + 
gindx >= this->m_ + ? -1 + : indx_g2l(gindx, this->mb_, this->irsrc_, this->nprows_); +} +__host__ __device__ int Array_Desc_Device::indx_g2l_c(int gindx)const{ + return this->mypcol_ != indx_g2p(gindx, this->nb_, this->icsrc_, this->npcols_) || + gindx >= this->n_ + ? -1 + : indx_g2l(gindx, this->nb_, this->icsrc_, this->npcols_); +} \ No newline at end of file diff --git a/src/array_desc_device.h b/src/array_desc_device.h new file mode 100644 index 00000000..15e25e80 --- /dev/null +++ b/src/array_desc_device.h @@ -0,0 +1,61 @@ +#include "base_blacs.h" +class Array_Desc_Device{ +private: + int ictxt_; + // int nprocs_; + // int myid_; + int nprows_; + int myprow_; + int npcols_; + int mypcol_; + + // Array dimensions + int m_; + int n_; + int mb_; + int nb_; + int irsrc_; + int icsrc_; + int lld_; + int m_local_; + int n_local_; + __host__ __device__ static int indx_g2p( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); + __host__ __device__ static int indx_g2l( + const int &indxglob, const int &nb, const int &isrcproc, const int &nprocs); +public: + Array_Desc_Device(const LIBRPA::Array_Desc& array_desc); + __host__ __device__ + int indx_g2l_r(int gindx) const; + __host__ __device__ + int indx_g2l_c(int gindx) const; + __host__ __device__ + const int& m() const{ return m_; } + __host__ __device__ + const int& n() const{ return n_; } + __host__ __device__ + const int& mb() const{ return mb_; } + __host__ __device__ + const int& nb() const{ return nb_; } + __host__ __device__ + const int& irsrc() const{ return irsrc_; } + __host__ __device__ + const int& icsrc() const{ return icsrc_; } + __host__ __device__ + const int& lld() const{ return lld_; } + __host__ __device__ + const int& m_loc() const{ return m_local_; } + __host__ __device__ + const int& n_loc() const{ return n_local_; } + __host__ __device__ + const int& nprows() const{ return nprows_; } + __host__ __device__ + const int& npcols() const{ return npcols_; } + __host__ 
__device__ + const int& myprow() const{ return myprow_; } + __host__ __device__ + const int& mypcol() const{ return mypcol_; } + __host__ __device__ + const int& ictxt() const{ return ictxt_; } + +}; \ No newline at end of file diff --git a/src/cuda_connector.cpp b/src/cuda_connector.cpp new file mode 100644 index 00000000..f273c680 --- /dev/null +++ b/src/cuda_connector.cpp @@ -0,0 +1,2108 @@ +#include "cuda_connector.h" +#include +// cusolverMp include +#include "helpers.h" +#include +#include "lapack_connector.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +#include "device_stream.h" +// #define CUSOLVERMP_MPI_GRID_COL_MAJOR +// #define OPEN_TEST_FOR_LU_DECOMPOSITION +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// #include +// #include +// #endif +// #ifdef ENABLE_CUBLASMP +// // #include +// // #include +// #endif +// #include +// #ifdef ENABLE_CUSOLVERMP +// void CudaConnector::pzgetrf_cusolverMp(const int &m, const int &n, std::complex *h_C_A, const int &ia, +// const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int *ipiv, int &h_info_getrf,const char order) +// { +// const int64_t M = arrdesc_pi.m(); +// const int64_t N = arrdesc_pi.n(); + +// const int64_t IA = ia; +// const int64_t JA = ja; + +// /* Tile sizes */ +// const int64_t MA = arrdesc_pi.mb(); +// const int64_t NA = arrdesc_pi.nb(); +// int numRowDevices, numColDevices; +// if(order=='C'){ +// numRowDevices = arrdesc_pi.npcols(); +// numColDevices = arrdesc_pi.nprows(); +// } +// else if(order=='R'){ +// numRowDevices = arrdesc_pi.nprows(); +// numColDevices = arrdesc_pi.npcols(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + + +// const uint32_t RSRCA = 0; +// const uint32_t CSRCA = 0; + +// int mpiCommSize, mpiRank; +// MPI_Comm_size(MPI_COMM_WORLD, &mpiCommSize); +// MPI_Comm_rank(MPI_COMM_WORLD, &mpiRank); + +// int local_device = getLocalDevice(); +// int numDevices = 0; + +// // printf("Number of devices = %d\n", numDevices); 
+// // printf("local_device = %d, mpiRank = %d, mpiCommSize = %d\n", local_device, mpiRank, mpiCommSize); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_set_device=omp_get_wtime(); +// #endif +// cudaError_t cudaStat = cudaSetDevice(local_device); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for set device:%f seconds, rank:%d\n", omp_get_wtime() - time_start_set_device,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); +// cudaStat = cudaFree(0); +// assert(cudaStat == cudaSuccess); +// // const int mpiRank_T=arrdesc_pi.myprow()+ arrdesc_pi.mypcol()*arrdesc_pi.npcols(); +// const int rank = mpiRank; +// // printf("mpiRank=%d,mpiRank_T=%d\n", mpiRank, mpiRank_T); +// const int commSize = mpiCommSize; +// /* Library handles */ +// cusolverMpHandle_t cusolverMpHandle = NULL; +// cal_comm_t cal_comm = NULL; + +// /* Error codes */ +// cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; +// calError_t calStat = CAL_OK; +// cudaStat = cudaSuccess; + +// /* User defined stream */ +// cudaStream_t localStream = NULL; +// cal_comm_create_params_t params; +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = commSize; +// params.local_device = local_device; + +// calStat = cal_comm_create(params, &cal_comm); +// assert(calStat == CAL_OK); + +// /* Create local stream */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_create_stream=omp_get_wtime(); +// #endif +// cudaStat = cudaStreamCreate(&localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for create stream:%f seconds, rank:%d\n", omp_get_wtime() - time_start_create_stream,mpiRank); +// #endif +// assert(cudaStat == cudaSuccess); + +// /* Initialize cusolverMp library handle */ +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// double time_start_cusolverMpCreate=omp_get_wtime(); +// #endif +// cusolverStat = 
cusolverMpCreate(&cusolverMpHandle, local_device, localStream); +// #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION +// printf("time for cusolverMpCreate:%f seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank); +// #endif +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* cusolverMp grids */ +// cusolverMpGrid_t gridA_C = NULL; + +// /* cusolverMp matrix descriptors */ +// cusolverMpMatrixDescriptor_t descrA_C = NULL; + +// /* Distributed matrices */ +// void* d_C_A = NULL; // for cuDoubleComplex +// int64_t* d_ipiv = NULL; + +// /* Distributed device workspace */ +// void* d_work_getrf_C = NULL; + +// /* Distributed host workspace */ +// void* h_work_getrf_C = NULL; + +// /* size of workspace on device */ +// size_t workspaceInBytesOnDevice_getrf_C = 0; + +// /* size of workspace on host */ +// size_t workspaceInBytesOnHost_getrf_C = 0; + +// /* error codes from cusolverMp (device) */ +// int* d_info_getrf = NULL; + +// /* error codes from cusolverMp (host) */ +// // int h_info_getrf = 0; + +// /* Single process per device */ +// assert((numRowDevices * numColDevices) == commSize); + +// /* =========================================== */ +// /* Create inputs on master rank */ +// /* =========================================== */ + +// const int64_t lda = (IA - 1) + N; +// const int64_t colsA = (JA - 1) + N; +// // cuDoubleComplex* h_C_A = NULL; +// int64_t LLDA, localColsA; +// if(order=='C'){ +// localColsA =arrdesc_pi.m_loc(); +// LLDA=arrdesc_pi.n_loc(); +// }else if(order=='R'){ +// localColsA =arrdesc_pi.n_loc(); +// LLDA=arrdesc_pi.m_loc(); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } + +// /* Allocate global d_A */ +// cudaStat = cudaMalloc((void**)&d_C_A, localColsA * LLDA * sizeof(cuDoubleComplex)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* CREATE GRID DESCRIPTORS */ +// /* =========================================== */ +// 
if(order=='C'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); +// }else if(order=='R'){ +// cusolverStat = cusolverMpCreateDeviceGrid( +// cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* =========================================== */ +// /* CREATE MATRIX DESCRIPTORS */ +// /* =========================================== */ +// cusolverStat = cusolverMpCreateMatrixDesc( +// &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* Allocate global d_ipiv */ +// /* REMARK : ipiv overlaps A[IA, JA:JA+N] as in Netlib's ScaLAPACK */ +// cudaStat = cudaMalloc((void**)&d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* ALLOCATE D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMalloc((void**)&d_info_getrf, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* RESET D_INFO */ +// /* =========================================== */ + +// cudaStat = cudaMemset(d_info_getrf, 1, sizeof(int)); +// assert(cudaStat == cudaSuccess); + +// /* =========================================== */ +// /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ +// /* =========================================== */ +// cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// CUDA_C_64F, +// &workspaceInBytesOnDevice_getrf_C, +// &workspaceInBytesOnHost_getrf_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// /* 
=========================================== */ +// /* ALLOCATE PGETRF WORKSPACE */ +// /* =========================================== */ +// cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C); +// assert(cudaStat == cudaSuccess); +// h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); +// assert(h_work_getrf_C != NULL); + +// // copy matrix from h_C_A to d_C_A +// std::complex* h_C_A_temp = NULL; +// size_t temp_size = (int64_t)localColsA * (int64_t)LLDA * sizeof(cuDoubleComplex); +// if(order=='C'){ +// h_C_A_temp = LapackConnector::transpose(h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// cudaStat = cudaMemcpy(d_C_A, h_C_A_temp, temp_size, cudaMemcpyHostToDevice); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(d_C_A, h_C_A, temp_size, cudaMemcpyHostToDevice); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to device */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* =========================================== */ +// /* CALL PGETRF */ +// /* =========================================== */ +// h_info_getrf=1; +// // printf("h_info_getrf before LU composition(cuDoubleComplex) : %d\n", h_info_getrf); +// // printf("LU decomposition begin(cuDoubleComplex)\n"); +// double start_time_C = omp_get_wtime(); +// cusolverStat = cusolverMpGetrf(cusolverMpHandle, +// N, +// N, +// d_C_A, +// IA, +// JA, +// descrA_C, +// d_ipiv, +// CUDA_C_64F, +// d_work_getrf_C, +// workspaceInBytesOnDevice_getrf_C, +// h_work_getrf_C, +// workspaceInBytesOnHost_getrf_C, +// d_info_getrf); + +// /* sync after cusolverMpGetrf */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); +// // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank); + +// /* copy d_info_getrf to host */ +// 
cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); +// assert(cudaStat == cudaSuccess); +// /* wait for d_info_getrf copy */ +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); +// // printf("h_info_getrf after composition(cuDoubleComplex) : %d\n", h_info_getrf); +// /* check return value of cusolverMpGetrf */ +// assert(h_info_getrf == 0); +// // copy d_ipiv to ipiv +// cudaStat = cudaMemcpy(ipiv, d_ipiv, arrdesc_pi.m_loc() * sizeof(int64_t), cudaMemcpyDeviceToHost); +// assert(cudaStat == cudaSuccess); +// // copy matrix from d_C_A to h_C_A +// if(order=='C'){ +// cudaStat = cudaMemcpy(h_C_A_temp, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// LapackConnector::transpose(h_C_A_temp, h_C_A, arrdesc_pi.m_loc(), arrdesc_pi.n_loc()); +// }else if(order=='R'){ +// cudaStat = cudaMemcpy(h_C_A, d_C_A, temp_size, cudaMemcpyDeviceToHost); +// }else{ +// fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n"); +// } +// assert(cudaStat == cudaSuccess); + +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + +// cudaStat = cudaStreamSynchronize(localStream); +// assert(cudaStat == cudaSuccess); + + +// /* sync wait for data to arrive to host */ +// calStat = cal_stream_sync(cal_comm, localStream); +// assert(calStat == CAL_OK); + + +// /* =========================================== */ +// /* CHECK RESIDUAL ON MASTER */ +// /* =========================================== */ + +// /* =========================================== */ +// /* CLEAN UP HOST WORKSPACE ON MASTER */ +// /* =========================================== */ +// cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + +// cusolverStat = cusolverMpDestroyGrid(gridA_C); +// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); +// if (d_C_A != NULL) +// { +// cudaStat = cudaFree(d_C_A); +// assert(cudaStat == cudaSuccess); +// 
// d_C_A = NULL;
// }
// if (d_ipiv != NULL)
// {
//     cudaStat = cudaFree(d_ipiv);
//     assert(cudaStat == cudaSuccess);
//     d_ipiv = NULL;
// }

// if(h_C_A_temp != NULL)
// {
//     delete [] h_C_A_temp;
//     h_C_A_temp = NULL;
// }


// if (d_work_getrf_C != NULL)
// {
//     cudaStat = cudaFree(d_work_getrf_C);
//     assert(cudaStat == cudaSuccess);
//     d_work_getrf_C = NULL;
// }

// if (d_info_getrf != NULL)
// {
//     cudaStat = cudaFree(d_info_getrf);
//     assert(cudaStat == cudaSuccess);
//     d_info_getrf = NULL;
// }
// if (h_work_getrf_C)
// {
//     free(h_work_getrf_C);
//     h_work_getrf_C = NULL;
// }
// /* Destroy cusolverMp handle */
// cusolverStat = cusolverMpDestroy(cusolverMpHandle);
// assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

// /* sync before cal_comm_destroy */
// calStat = cal_comm_barrier(cal_comm, localStream);
// assert(calStat == CAL_OK);

// /* destroy CAL communicator */
// calStat = cal_comm_destroy(cal_comm);
// assert(calStat == CAL_OK);

// /* destroy user stream */
// cudaStat = cudaStreamDestroy(localStream);
// assert(cudaStat == cudaSuccess);

// /* MPI barrier before MPI_Finalize */
// MPI_Barrier(MPI_COMM_WORLD);
// // printf("success in test\n");
// return;
// }
// #endif

#ifdef ENABLE_NVHPC
/**
 * @brief Distributed in-place LU factorization (ScaLAPACK-style p?getrf) of a
 *        block-cyclic double-complex matrix on GPUs via cusolverMp.
 *
 * @param gpu_dev_stream  per-process GPU context (CUDA stream, cusolverMp handle,
 *                        CAL communicator, rank/size) reused across calls
 * @param d_A             device-resident distributed matrix; overwritten with its
 *                        LU factors
 * @param ia, ja          1-based global row/column offsets of the submatrix
 * @param arrdesc_pi      ScaLAPACK array descriptor of the global matrix
 * @param d_ipiv          device buffer receiving the local pivot indices
 * @param d_info_getrf    device int receiving the getrf status (asserted == 0)
 * @param order           'C' (column-major grid) or 'R' (row-major grid); the
 *                        process-grid dimensions and local leading dimension are
 *                        swapped accordingly
 *
 * NOTE(review): on an invalid `order` the error branches only fprintf and fall
 * through, leaving numRowDevices/numColDevices (and LLDA) uninitialized —
 * callers are presumably guaranteed to pass 'C' or 'R'; confirm.
 */
void CudaConnector::pzgetrf_nvhpc(const GpuDeviceStream&gpu_dev_stream, ComplexMatrixDevice &d_A, const int &ia,
    const int &ja, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, const char& order)
{
    // Global matrix extents from the ScaLAPACK descriptor.
    const int64_t M = arrdesc_pi.m();
    const int64_t N = arrdesc_pi.n();

    const int64_t IA = ia;
    const int64_t JA = ja;

    /* Tile sizes */
    const int64_t MA = arrdesc_pi.mb();
    const int64_t NA = arrdesc_pi.nb();
    // Map the BLACS process grid onto the cusolverMp device grid; for a
    // column-major ('C') grid the row/column device counts are swapped.
    int numRowDevices, numColDevices;
    if(order=='C'){
        numRowDevices = arrdesc_pi.npcols();
        numColDevices = arrdesc_pi.nprows();
    }
    else if(order=='R'){
        numRowDevices = arrdesc_pi.nprows();
        numColDevices = arrdesc_pi.npcols();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n");
    }


    // First process row/column owning the matrix (always the grid origin here).
    const uint32_t RSRCA = 0;
    const uint32_t CSRCA = 0;

    int mpiRank = gpu_dev_stream.rank;
    int mpiCommSize = gpu_dev_stream.nranks;

    int local_device = gpu_dev_stream.local_device;

    const int rank = mpiRank;
    const int commSize = mpiCommSize;
    /* Library handles */
    cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle;
    cal_comm_t cal_comm = gpu_dev_stream.cal_comm;

    /* Error codes */
    cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat = cudaSuccess;
    cudaStream_t localStream = gpu_dev_stream.stream;
    /* Initialize cusolverMp library handle */
    #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION
    double time_start_cusolverMpCreate=omp_get_wtime();
    #endif
    // cusolverStat = cusolverMpCreate(&cusolverMpHandle, local_device, localStream);
    #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION
    // printf("time for cusolverMpCreate:%f seconds, rank:%d\n", omp_get_wtime() - time_start_cusolverMpCreate,mpiRank);
    #endif
    // NOTE(review): cusolverMpCreate above is commented out (handle comes from
    // gpu_dev_stream), so this assert on the untouched status is vacuous.
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* cusolverMp grids */
    cusolverMpGrid_t gridA_C = NULL;

    /* cusolverMp matrix descriptors */
    cusolverMpMatrixDescriptor_t descrA_C = NULL;

    /* Distributed matrices */
    void* d_C_A = d_A.ptr(); // for cuDoubleComplex


    /* Distributed device workspace */
    void* d_work_getrf_C = NULL;

    /* Distributed host workspace */
    void* h_work_getrf_C = NULL;

    /* size of workspace on device */
    size_t workspaceInBytesOnDevice_getrf_C = 0;

    /* size of workspace on host */
    size_t workspaceInBytesOnHost_getrf_C = 0;

    /* Single process per device */
    assert((numRowDevices * numColDevices) == commSize);

    /* =========================================== */
    /* Create inputs on master rank                */
    /* =========================================== */

    const int64_t lda = (IA - 1) + N;
    const int64_t colsA = (JA - 1) + N;
    // cuDoubleComplex* h_C_A = NULL;
    // Local leading dimension / local column count depend on the grid order.
    int64_t LLDA, localColsA;
    if(order=='C'){
        localColsA =arrdesc_pi.m_loc();
        LLDA=arrdesc_pi.n_loc();
    }else if(order=='R'){
        localColsA =arrdesc_pi.n_loc();
        LLDA=arrdesc_pi.m_loc();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n");
    }

    /* =========================================== */
    /* CREATE GRID DESCRIPTORS                     */
    /* =========================================== */
    if(order=='C'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR);
    }else if(order=='R'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR);
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C' or 'R'\n");
    }
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* CREATE MATRIX DESCRIPTORS                   */
    /* =========================================== */
    cusolverStat = cusolverMpCreateMatrixDesc(
        &descrA_C, gridA_C, CUDA_C_64F, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);


    /* =========================================== */
    /* RESET D_INFO                                */
    /* =========================================== */

    // NOTE(review): cudaMemset writes the byte value 1 into each of the 4
    // bytes, i.e. *d_info_getrf becomes 0x01010101, not 1. It only serves as a
    // non-zero "not yet written" sentinel before getrf overwrites it — confirm
    // that is the intent.
    CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int)));


    /* =========================================== */
    /* QUERY WORKSPACE SIZE FOR MP ROUTINES        */
    /* =========================================== */
    // NOTE(review): passes N for both global dimensions (assumes the factored
    // submatrix is square, M == N) — TODO confirm against callers.
    cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle,
                                              N,
                                              N,
                                              d_C_A,
                                              IA,
                                              JA,
                                              descrA_C,
                                              d_ipiv,
                                              CUDA_C_64F,
                                              &workspaceInBytesOnDevice_getrf_C,
                                              &workspaceInBytesOnHost_getrf_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* ALLOCATE PGETRF WORKSPACE                   */
    /* =========================================== */
    cudaStat = cudaMalloc((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C);
    assert(cudaStat == cudaSuccess);
    h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C);
    assert(h_work_getrf_C != NULL);


    /* sync wait for data to arrive to device */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));


    /* =========================================== */
    /* CALL PGETRF                                 */
    /* =========================================== */
    double start_time_C = omp_get_wtime();
    cusolverStat = cusolverMpGetrf(cusolverMpHandle,
                                   N,
                                   N,
                                   d_C_A,
                                   IA,
                                   JA,
                                   descrA_C,
                                   d_ipiv,
                                   CUDA_C_64F,
                                   d_work_getrf_C,
                                   workspaceInBytesOnDevice_getrf_C,
                                   h_work_getrf_C,
                                   workspaceInBytesOnHost_getrf_C,
                                   d_info_getrf);

    /* sync after cusolverMpGetrf */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));
    // printf("LU decomposition end(cuDoubleComplex), time = %f seconds,rand=%d\n", omp_get_wtime() - start_time_C,rank);

    /* copy d_info_getrf to host */
    int h_info_getrf=1;
    cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream);
    assert(cudaStat == cudaSuccess);
    /* wait for d_info_getrf copy */
    gpu_dev_stream.cudaSync();
    assert(cudaStat == cudaSuccess);
    // A zero info means the factorization succeeded (no singular pivot).
    assert(h_info_getrf == 0);


    /* sync wait for data to arrive to host */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));

    /* =========================================== */
    /* CLEAN UP HOST WORKSPACE ON MASTER           */
    /* =========================================== */
    cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    cusolverStat = cusolverMpDestroyGrid(gridA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    if (d_work_getrf_C != NULL)
    {
        cudaStat = cudaFree(d_work_getrf_C);
        assert(cudaStat == cudaSuccess);
        d_work_getrf_C = NULL;
    }

    if (h_work_getrf_C)
    {
        free(h_work_getrf_C);
        h_work_getrf_C = NULL;
    }
    /* Destroy cusolverMp handle */
    // cusolverStat = cusolverMpDestroy(cusolverMpHandle);
    // assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* MPI barrier before MPI_Finalize */
    MPI_Barrier(MPI_COMM_WORLD);
    // printf("success in test\n");
    return;
}
/**
 * @brief Mixed-precision distributed LU factorization via cusolverMp using an
 *        explicit GpuDeviceStream context.
 *
 * Same flow as pzgetrf_nvhpc but the cuda data type is a parameter
 * (`computeType`) instead of hard-coded CUDA_C_64F, the device workspace is
 * allocated/freed asynchronously on the context stream, and 'c'/'r' are
 * accepted in addition to 'C'/'R' for @p order.
 *
 * @param d_A  raw device pointer to the local block-cyclic tile of the matrix;
 *             its element type must match @p computeType — TODO confirm callers.
 */
void CudaConnector::pgetrf_nvhpc_mixed_precision(
    const GpuDeviceStream&gpu_dev_stream, void *d_A,
    const int &ia, const int &ja,
    const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf,
    const cudaDataType_t &computeType,const char &order
    )
{
    const int64_t M = arrdesc_pi.m();
    const int64_t N = arrdesc_pi.n();

    const int64_t IA = ia;
    const int64_t JA = ja;

    /* Tile sizes */
    const int64_t MA = arrdesc_pi.mb();
    const int64_t NA = arrdesc_pi.nb();
    // Swap the device-grid dimensions for a column-major process grid.
    int numRowDevices, numColDevices;
    if(order=='C'||order=='c'){
        numRowDevices = arrdesc_pi.npcols();
        numColDevices = arrdesc_pi.nprows();
    }
    else if(order=='R'||order=='r'){
        numRowDevices = arrdesc_pi.nprows();
        numColDevices = arrdesc_pi.npcols();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n");
    }


    const uint32_t RSRCA = 0;
    const uint32_t CSRCA = 0;

    int local_device = gpu_dev_stream.local_device;

    const int rank = gpu_dev_stream.rank;
    const int commSize = gpu_dev_stream.nranks;
    /* Library handles */
    cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle;
    cal_comm_t cal_comm = gpu_dev_stream.cal_comm;

    /* Error codes */
    cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS;
    cudaError_t cudaStat = cudaSuccess;
    cudaStream_t localStream = gpu_dev_stream.stream;

    /* cusolverMp grids */
    cusolverMpGrid_t gridA_C = NULL;

    /* cusolverMp matrix descriptors */
    cusolverMpMatrixDescriptor_t descrA_C = NULL;

    /* Distributed matrices */
    void* d_C_A = d_A; //


    /* Distributed device workspace */
    void* d_work_getrf_C = NULL;

    /* Distributed host workspace */
    void* h_work_getrf_C = NULL;

    /* size of workspace on device */
    size_t workspaceInBytesOnDevice_getrf_C = 0;

    /* size of workspace on host */
    size_t workspaceInBytesOnHost_getrf_C = 0;

    /* Single process per device */
    assert((numRowDevices * numColDevices) == commSize);

    const int64_t lda = (IA - 1) + N;
    const int64_t colsA = (JA - 1) + N;
    // cuDoubleComplex* h_C_A = NULL;
    // Local leading dimension depends on the grid order.
    int64_t LLDA, localColsA;
    if(order=='C'||order=='c'){
        localColsA =arrdesc_pi.m_loc();
        LLDA=arrdesc_pi.n_loc();
    }else if(order=='R'||order=='r'){
        localColsA =arrdesc_pi.n_loc();
        LLDA=arrdesc_pi.m_loc();
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n");
    }

    /* =========================================== */
    /* CREATE GRID DESCRIPTORS                     */
    /* =========================================== */
    if(order=='C'||order=='c'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR);
    }else if(order=='R'||order=='r'){
        cusolverStat = cusolverMpCreateDeviceGrid(
            cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR);
    }else{
        fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n");
    }
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* CREATE MATRIX DESCRIPTORS                   */
    /* =========================================== */
    cusolverStat = cusolverMpCreateMatrixDesc(
        &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);


    /* =========================================== */
    /* RESET D_INFO                                */
    /* =========================================== */

    // NOTE(review): byte-wise memset — *d_info_getrf becomes 0x01010101, a
    // non-zero sentinel only.
    CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int)));


    /* =========================================== */
    /* QUERY WORKSPACE SIZE FOR MP ROUTINES        */
    /* =========================================== */
    cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle,
                                              N,
                                              N,
                                              d_C_A,
                                              IA,
                                              JA,
                                              descrA_C,
                                              d_ipiv,
                                              computeType,
                                              &workspaceInBytesOnDevice_getrf_C,
                                              &workspaceInBytesOnHost_getrf_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* =========================================== */
    /* ALLOCATE PGETRF WORKSPACE                   */
    /* =========================================== */
    CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream));
    h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C);
    assert(h_work_getrf_C != NULL);

    /* =========================================== */
    /* CALL PGETRF                                 */
    /* =========================================== */
    double start_time_C = omp_get_wtime();
    CUSOLVERMP_CHECK(cusolverMpGetrf(
        cusolverMpHandle, N, N,
        d_C_A, IA, JA, descrA_C,
        d_ipiv, computeType,
        d_work_getrf_C, workspaceInBytesOnDevice_getrf_C,
        h_work_getrf_C, workspaceInBytesOnHost_getrf_C,
        d_info_getrf));
    // NOTE(review): cusolverStat is not assigned by the CUSOLVERMP_CHECK call
    // above, so this assert is vacuous (the macro already checks the status).
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* sync after cusolverMpGetrf */
    CAL_CHECK(cal_stream_sync(cal_comm, localStream));

    /* copy d_info_getrf to host */
    int h_info_getrf=1;
    cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream);
    assert(cudaStat == cudaSuccess);
    /* wait for d_info_getrf copy */
    gpu_dev_stream.cudaSync();
    assert(cudaStat == cudaSuccess);
    assert(h_info_getrf == 0);

    /* =========================================== */
    /* CLEAN UP HOST WORKSPACE ON MASTER           */
    /* =========================================== */
    cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    cusolverStat = cusolverMpDestroyGrid(gridA_C);
    assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    if (d_work_getrf_C != NULL)
    {
        CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream));
        d_work_getrf_C = NULL;
    }

    if (h_work_getrf_C)
    {
        free(h_work_getrf_C);
        h_work_getrf_C = NULL;
    }
    /* Destroy cusolverMp handle */
    // cusolverStat = cusolverMpDestroy(cusolverMpHandle);
    // assert(cusolverStat == CUSOLVER_STATUS_SUCCESS);

    /* MPI barrier before MPI_Finalize */
    MPI_Barrier(MPI_COMM_WORLD);
    // printf("success in test\n");
    return;
}
+void CudaConnector::pgetrf_nvhpc_mixed_precision( + void *d_A, const int &ia, const int &ja, + const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info_getrf, + const cudaDataType_t &computeType,const char &order + ) +{ + const int64_t M = arrdesc_pi.m(); + const int64_t N = arrdesc_pi.n(); + + const int64_t IA = ia; + const int64_t JA = ja; + + /* Tile sizes */ + const int64_t MA = arrdesc_pi.mb(); + const int64_t NA = arrdesc_pi.nb(); + int numRowDevices, numColDevices; + if(order=='C'||order=='c'){ + numRowDevices = arrdesc_pi.npcols(); + numColDevices = arrdesc_pi.nprows(); + } + else if(order=='R'||order=='r'){ + numRowDevices = arrdesc_pi.nprows(); + numColDevices = arrdesc_pi.npcols(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + + const uint32_t RSRCA = 0; + const uint32_t CSRCA = 0; + + int local_device = device_stream.local_device; + + const int rank = mpi_comm_global_h.myid; + const int commSize = mpi_comm_global_h.nprocs; + /* Library handles */ + cusolverMpHandle_t cusolverMpHandle = device_stream.cusolverMp_handle; + cal_comm_t cal_comm = device_stream.cal_comm; + + /* Error codes */ + cusolverStatus_t cusolverStat = CUSOLVER_STATUS_SUCCESS; + cudaError_t cudaStat = cudaSuccess; + cudaStream_t localStream = (cudaStream_t)device_stream.stream; + + /* cusolverMp grids */ + cusolverMpGrid_t gridA_C = NULL; + + /* cusolverMp matrix descriptors */ + cusolverMpMatrixDescriptor_t descrA_C = NULL; + + /* Distributed matrices */ + void* d_C_A = d_A; // + + + /* Distributed device workspace */ + void* d_work_getrf_C = NULL; + + /* Distributed host workspace */ + void* h_work_getrf_C = NULL; + + /* size of workspace on device */ + size_t workspaceInBytesOnDevice_getrf_C = 0; + + /* size of workspace on host */ + size_t workspaceInBytesOnHost_getrf_C = 0; + + /* Single process per device */ + assert((numRowDevices * numColDevices) == commSize); + + const int64_t lda = (IA - 1) + N; + const int64_t 
colsA = (JA - 1) + N; + // cuDoubleComplex* h_C_A = NULL; + int64_t LLDA, localColsA; + if(order=='C'||order=='c'){ + localColsA =arrdesc_pi.m_loc(); + LLDA=arrdesc_pi.n_loc(); + }else if(order=='R'||order=='r'){ + localColsA =arrdesc_pi.n_loc(); + LLDA=arrdesc_pi.m_loc(); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + + /* =========================================== */ + /* CREATE GRID DESCRIPTORS */ + /* =========================================== */ + if(order=='C'||order=='c'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_COL_MAJOR); + }else if(order=='R'||order=='r'){ + cusolverStat = cusolverMpCreateDeviceGrid( + cusolverMpHandle, &gridA_C, cal_comm, numRowDevices, numColDevices, CUSOLVERMP_GRID_MAPPING_ROW_MAJOR); + }else{ + fprintf(stderr, "Error: cusolverMpgetrf order must be 'C'('c') or 'R'('r')\n"); + } + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + /* CREATE MATRIX DESCRIPTORS */ + /* =========================================== */ + cusolverStat = cusolverMpCreateMatrixDesc( + &descrA_C, gridA_C, computeType, (IA - 1) + M, (JA - 1) + N, MA, NA, RSRCA, CSRCA, LLDA); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + + /* =========================================== */ + /* RESET D_INFO */ + /* =========================================== */ + + CUDA_CHECK(cudaMemset(d_info_getrf, 1, sizeof(int))); + + + /* =========================================== */ + /* QUERY WORKSPACE SIZE FOR MP ROUTINES */ + /* =========================================== */ + cusolverStat = cusolverMpGetrf_bufferSize(cusolverMpHandle, + N, + N, + d_C_A, + IA, + JA, + descrA_C, + d_ipiv, + computeType, + &workspaceInBytesOnDevice_getrf_C, + &workspaceInBytesOnHost_getrf_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* =========================================== */ + 
/* ALLOCATE PGETRF WORKSPACE */ + /* =========================================== */ + CUDA_CHECK(cudaMallocAsync((void**)&d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, localStream)); + h_work_getrf_C = (void*)malloc(workspaceInBytesOnHost_getrf_C); + assert(h_work_getrf_C != NULL); + + /* =========================================== */ + /* CALL PGETRF */ + /* =========================================== */ + double start_time_C = omp_get_wtime(); + CUSOLVERMP_CHECK(cusolverMpGetrf( + cusolverMpHandle, N, N, + d_C_A, IA, JA, descrA_C, + d_ipiv, computeType, + d_work_getrf_C, workspaceInBytesOnDevice_getrf_C, + h_work_getrf_C, workspaceInBytesOnHost_getrf_C, + d_info_getrf)); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + /* sync after cusolverMpGetrf */ + CAL_CHECK(cal_stream_sync(cal_comm, localStream)); + + /* copy d_info_getrf to host */ + int h_info_getrf=1; + cudaStat = cudaMemcpyAsync(&h_info_getrf, d_info_getrf, sizeof(int), cudaMemcpyDeviceToHost, localStream); + assert(cudaStat == cudaSuccess); + /* wait for d_info_getrf copy */ + device_stream.cudaSync(); + assert(cudaStat == cudaSuccess); + assert(h_info_getrf == 0); + + /* =========================================== */ + /* CLEAN UP HOST WORKSPACE ON MASTER */ + /* =========================================== */ + cusolverStat = cusolverMpDestroyMatrixDesc(descrA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + cusolverStat = cusolverMpDestroyGrid(gridA_C); + assert(cusolverStat == CUSOLVER_STATUS_SUCCESS); + + if (d_work_getrf_C != NULL) + { + CUDA_CHECK(cudaFreeAsync(d_work_getrf_C, localStream)); + d_work_getrf_C = NULL; + } + + if (h_work_getrf_C) + { + free(h_work_getrf_C); + h_work_getrf_C = NULL; + } + + /* MPI barrier before MPI_Finalize */ + MPI_Barrier(MPI_COMM_WORLD); + return; +} +void CudaConnector::pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + const void* d_A, const int64_t& IA, const int64_t& JA, const 
LIBRPA::Array_Desc &arrdesc_A, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + int* d_info, const cudaDataType_t& compute_type, + const char& order +) +{ + const int64_t N = arrdesc_A.m(); + const int64_t NRHS = arrdesc_B.n(); + assert(arrdesc_A.m()==arrdesc_A.n()); + + int h_info = 1; + size_t workspaceInBytesOnDevice, workspaceInBytesOnHost; + void* h_work = NULL; + void* d_work = NULL; + const int64_t mbA = arrdesc_A.mb(); + const int64_t nbA = arrdesc_A.nb(); + const int64_t mbB = arrdesc_B.mb(); + const int64_t nbB = arrdesc_B.nb(); + assert(arrdesc_A.npcols() == arrdesc_B.npcols()); + assert(arrdesc_A.nprows() == arrdesc_B.nprows()); + int numRowDevices, numColDevices; + ORDER_CHECK(order); + if(order=='C'||order=='c'){ + assert(N==NRHS); + numRowDevices = arrdesc_A.npcols(); + numColDevices = arrdesc_A.nprows(); + } + else{ + numRowDevices = arrdesc_A.nprows(); + numColDevices = arrdesc_A.npcols(); + } + int64_t LLDA, localColsA, LLDB, localColsB; + if(order=='C'||order=='c'){ + LLDA=arrdesc_A.n_loc(); + localColsA =arrdesc_A.m_loc(); + LLDB=arrdesc_B.n_loc(); + localColsB =arrdesc_B.m_loc(); + }else{ + LLDA=arrdesc_A.m_loc(); + localColsA =arrdesc_A.n_loc(); + LLDB=arrdesc_B.m_loc(); + localColsB =arrdesc_B.n_loc(); + } + int mpiCommSize = gpu_dev_stream.nranks; + int rank = gpu_dev_stream.rank; + + cusolverMpHandle_t cusolverMpHandle = gpu_dev_stream.cusolver_handle; + + cusolverMpGrid_t grid = NULL; + cusolverMpMatrixDescriptor_t descrA = NULL; + cusolverMpMatrixDescriptor_t descrB = NULL; + + CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid( + cusolverMpHandle, &grid, gpu_dev_stream.cal_comm, + numRowDevices, numColDevices, + (order=='c'||order=='C')?CUSOLVERMP_GRID_MAPPING_COL_MAJOR:CUSOLVERMP_GRID_MAPPING_ROW_MAJOR) + ); + + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrA, grid, compute_type, + (IA - 1) + N, (JA - 1) + N, + mbA, nbA, 0, 0, LLDA) + ); + 
CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc( + &descrB, grid, compute_type, + (IB - 1) + N, (JB - 1) + NRHS, + mbB, nbB, 0, 0, LLDB) + ); + + CUSOLVERMP_CHECK(cusolverMpGetrs_bufferSize( + cusolverMpHandle, trans, N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + &workspaceInBytesOnDevice,&workspaceInBytesOnHost) + ); + gpu_dev_stream.calSync(); + if (workspaceInBytesOnHost > 0) + { + h_work = (void*)malloc(workspaceInBytesOnHost); + assert(h_work != NULL); + } + if (workspaceInBytesOnDevice > 0) + { + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream)); + } + gpu_dev_stream.calSync(); + CUSOLVERMP_CHECK(cusolverMpGetrs( + cusolverMpHandle, trans, + N, NRHS, + d_A, IA, JA, descrA, d_ipiv, + d_B, IB, JB, descrB, + compute_type, + d_work, workspaceInBytesOnDevice, + h_work, workspaceInBytesOnHost, + d_info) + ); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMemcpy(&h_info, d_info, sizeof(int), cudaMemcpyDeviceToHost)); + if(h_info!=0) + { + fprintf(stderr, "Error: cusolverMpgetrs failed with info=%d\n", h_info); + exit(1); + } + if(h_work!=NULL) + { + free(h_work); + } + if(d_work!=NULL){ + CUDA_CHECK(cudaFreeAsync(d_work, gpu_dev_stream.stream)); + } +} +void CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream, const cublasOperation_t& trans, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &arrdesc_A, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &arrdesc_B, + const cudaDataType_t& compute_type, const char& order +) +{ + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.n_loc(),gpu_dev_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*arrdesc_A.m_loc(),gpu_dev_stream.stream)); + } + pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, 
d_A, 1, 1, arrdesc_A, + d_ipiv, d_info, + CUDA_C_64F, order + ); + pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, trans, + d_A, 1, 1, arrdesc_A, + d_ipiv, + d_B, 1, 1, arrdesc_B, + d_info, + CUDA_C_64F, order + ); + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); +} +// void CudaConnector::pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, +// const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, +// const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, +// const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C) +// { +// using input_t = cuDoubleComplex; +// using output_t = cuDoubleComplex; +// using compute_t = cuDoubleComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_64F; +// const cudaDataType_t cuda_output_type = CUDA_C_64F; +// cublasComputeType_t cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + +// int64_t mbA = arrdesc_A.mb(); +// int64_t nbA = arrdesc_A.nb(); +// int64_t mbB = arrdesc_B.mb(); +// int64_t nbB = arrdesc_B.nb(); +// int64_t mbC = arrdesc_C.mb(); +// int64_t nbC = arrdesc_C.nb(); +// int nprow = arrdesc_A.nprows(); +// int npcol = arrdesc_A.npcols(); +// char grid_layout = 'r'; +// bool verbose = false; +// int rank, nranks; +// MPI_Comm_size(MPI_COMM_WORLD, &nranks); +// MPI_Comm_rank(MPI_COMM_WORLD, &rank); +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = arrdesc_A.myprow(); +// const int mypcol = arrdesc_A.mypcol(); +// if(rank==0) +// { +// printf("m:%" PRId64 ", n:%" PRId64 ", k:%" PRId64 "\n", m, n, k); +// printf("nprow:%d, npcol:%d\n", nprow, npcol); +// printf("mbA:%" PRId64 ", nbA:%" PRId64 "\n", mbA, nbA); +// } +// const int 
local_device = getLocalDevice(); +// printf("myrank:%d, myprow:%d, mypcol:%d, local_device:%d\n", rank, myprow, mypcol, local_device); +// CUDA_CHECK(cudaSetDevice(local_device)); +// CUDA_CHECK(cudaFree(nullptr)); + +// cal_comm_t cal_comm; +// cal_comm_create_params_t params; +// { +// params.allgather = allgather; +// params.req_test = request_test; +// params.req_free = request_free; +// params.data = (void*)(MPI_COMM_WORLD); +// params.rank = rank; +// params.nranks = nranks; +// params.local_device = local_device; + +// CAL_CHECK(cal_comm_create(params, &cal_comm)); +// } +// cudaStream_t stream = nullptr; +// CUDA_CHECK(cudaStreamCreate(&stream)); + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + +// cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; + +// input_t* d_A = nullptr; +// input_t* d_B = nullptr; +// output_t* d_C = nullptr; + +// void* d_work = nullptr; + +// compute_t alpha = {alphaD,0.0}; +// compute_t beta = {betaD,0.0}; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// const int64_t llda = cublasMpNumroc(global_m_a, mbA, myprow, 0, nprow); +// const int64_t loc_n_a = cublasMpNumroc(global_n_a, nbA, mypcol, 0, npcol); +// printf("rank:%d, llda=%" PRId64 ", loc_n_a=%" PRId64 "\n", rank, llda, loc_n_a); +// const int64_t lldb = cublasMpNumroc(global_m_b, mbB, myprow, 0, nprow); +// const int64_t loc_n_b = cublasMpNumroc(global_n_b, nbB, mypcol, 0, npcol); + +// const int64_t lldc = cublasMpNumroc(global_m_c, mbC, myprow, 0, nprow); +// const int64_t loc_n_c = 
cublasMpNumroc(global_n_c, nbC, mypcol, 0, npcol); + +// CUDA_CHECK(cudaMallocAsync(&d_A, llda * loc_n_a * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_B, lldb * loc_n_b * sizeof(input_t), stream)); +// CUDA_CHECK(cudaMallocAsync(&d_C, lldc * loc_n_c * sizeof(output_t), stream)); + +// CUDA_CHECK(cudaMemcpyAsync(d_A, A, llda * loc_n_a * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_B, B, lldb * loc_n_b * sizeof(input_t), cudaMemcpyHostToDevice, stream)); +// CUDA_CHECK(cudaMemcpyAsync(d_C, C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyHostToDevice, stream)); + +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// cal_comm, +// &grid)); + +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// cublasOperation_t transA,transB; +// if(transa=='N') +// transA=CUBLAS_OP_N; +// else if(transa=='T') +// transA=CUBLAS_OP_T; +// else if(transa=='C') +// transA=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transa=%c is not supported\n", transa); +// exit(1); +// } +// if(transb=='N') +// transB=CUBLAS_OP_N; +// else if(transb=='T') +// transB=CUBLAS_OP_T; +// else if(transb=='C') +// transB=CUBLAS_OP_C; +// else{ +// if(rank==0) +// printf("transb=%c is not supported\n", transb); +// exit(1); +// } + + + +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// &alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// &beta, +// d_C, +// ic, +// jc, +// descC, +// 
// cublas_compute_type,
// &workspaceInBytesOnDevice,
// &workspaceInBytesOnHost));

// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream));
// printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank);
// std::vector h_work(workspaceInBytesOnHost);

// CUDA_CHECK(cudaStreamSynchronize(stream));

// const double begin = MPI_Wtime();

// CUBLASMP_CHECK(cublasMpGemm(
//     handle,
//     transA,
//     transB,
//     m,
//     n,
//     k,
//     &alpha,
//     d_A,
//     ia,
//     ja,
//     descA,
//     d_B,
//     ib,
//     jb,
//     descB,
//     &beta,
//     d_C,
//     ic,
//     jc,
//     descC,
//     cublas_compute_type,
//     d_work,
//     workspaceInBytesOnDevice,
//     h_work.data(),
//     workspaceInBytesOnHost));
// printf("Duration(before synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank);
// CUDA_CHECK(cudaStreamSynchronize(stream));
// printf("Duration(after synchronize): %lf GFlops: %lf rank:%d\n", MPI_Wtime() - begin, (2 * m * n * k * 1e-9) / (MPI_Wtime() - begin), rank);
// if(verbose){
// }
// CUDA_CHECK(cudaMemcpyAsync(C, d_C, lldc * loc_n_c * sizeof(output_t), cudaMemcpyDeviceToHost, stream));
// CUDA_CHECK(cudaStreamSynchronize(stream));

// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA));
// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB));
// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC));

// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid));

// CUBLASMP_CHECK(cublasMpDestroy(handle));

// CUDA_CHECK(cudaFreeAsync(d_A, stream));
// CUDA_CHECK(cudaFreeAsync(d_B, stream));
// CUDA_CHECK(cudaFreeAsync(d_C, stream));
// CUDA_CHECK(cudaFreeAsync(d_work, stream));

// CAL_CHECK(cal_comm_destroy(cal_comm));
// CUDA_CHECK(cudaStreamDestroy(stream));

// MPI_Barrier(MPI_COMM_WORLD);

// }
/**
 * @brief Distributed GEMM on device-resident matrices whose cuBLASMp matrix
 *        descriptors (`desc_cublas`) have already been created.
 *
 * Computes C := alpha * op(A) * op(B) + beta * C entirely on the GPU: queries
 * the cuBLASMp workspace size, allocates the device workspace, runs
 * cublasMpGemm, and frees the workspace.
 *
 * @param handle  ready-to-use cuBLASMp handle bound to a stream/grid
 * @param alpha, beta  pointers to scalars matching @p cublas_compute_type
 * @param ia..jc  1-based global offsets of the three submatrices
 *
 * NOTE(review): `std::vector h_work(...)` is missing its element type here —
 * the template argument (presumably std::byte or uint8_t for a raw host
 * workspace) appears to have been stripped by text extraction; confirm against
 * the repository copy.
 */
void CudaConnector::pgemm_device(cublasMpHandle_t handle,cublasOperation_t transA,cublasOperation_t transB,const int &m,const int &n,const int &k,
    const void *alpha,
    const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja,
    const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb,
    const void *beta,
    ComplexMatrixDevice &d_C,int64_t ic,int64_t jc,
    cublasComputeType_t cublas_compute_type)
{
    void* d_work;

    size_t workspaceInBytesOnDevice,workspaceInBytesOnHost;
    // Ask cuBLASMp how much device/host scratch the GEMM needs.
    CUBLASMP_CHECK(cublasMpGemm_bufferSize(handle,transA,transB,m,n,k,
                                           alpha,
                                           (const void *)(d_A.ptr()),ia,ja,d_A.desc_cublas,
                                           (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas,
                                           beta,
                                           d_C.ptr(),ic,jc,d_C.desc_cublas,
                                           cublas_compute_type,
                                           &workspaceInBytesOnDevice,
                                           &workspaceInBytesOnHost));
    CUDA_CHECK(cudaMalloc((void**)&d_work,workspaceInBytesOnDevice));
    std::vector h_work(workspaceInBytesOnHost);
    CUBLASMP_CHECK(cublasMpGemm(handle,transA,transB,m,n,k,
                                alpha,
                                (const void *)d_A.ptr(),ia,ja,d_A.desc_cublas,
                                (const void *)d_B.ptr(),ib,jb,d_B.desc_cublas,
                                beta,
                                d_C.ptr(),ic,jc,d_C.desc_cublas,
                                cublas_compute_type,
                                d_work,workspaceInBytesOnDevice,
                                h_work.data(),workspaceInBytesOnHost));

    CUDA_CHECK(cudaFree(d_work));
}

/**
 * @brief Self-contained distributed GEMM via cuBLASMp from ScaLAPACK-style
 *        array descriptors: creates the cuBLASMp handle, device grid and the
 *        three matrix descriptors per call, runs the GEMM, then tears
 *        everything down.
 *
 * C := alpha * op(A) * op(B) + beta * C for device-resident double-complex
 * matrices. The global matrix extents are derived from (m, n, k), the offsets
 * and the transpose flags. Process grid mapping is hard-coded row-major
 * (grid_layout = 'r').
 *
 * NOTE(review): myprow/mypcol are computed but never used below.
 * NOTE(review): d_work is allocated with cudaMallocAsync on the context stream
 * but released with synchronous cudaFree after cudaSync() — legal, though
 * asymmetric; cudaFreeAsync would match the allocation.
 * NOTE(review): `std::vector h_work(...)` — element type stripped by text
 * extraction, same as in pgemm_device; confirm.
 */
void CudaConnector::pgemm_nvhpc(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k,
    const void *alpha,
    const ComplexMatrixDevice &d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA,
    const ComplexMatrixDevice &d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB,
    const void *beta,
    ComplexMatrixDevice & d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC,
    cublasComputeType_t cublas_compute_type)
{
    using input_t = cuDoubleComplex;
    using output_t = cuDoubleComplex;
    using compute_t = cuDoubleComplex;
    const cudaDataType_t cuda_input_type = CUDA_C_64F;
    const cudaDataType_t cuda_output_type = CUDA_C_64F;

    // Blocking factors and grid shape from the ScaLAPACK descriptors.
    int64_t mbA = array_descA.mb();
    int64_t nbA = array_descA.nb();
    int64_t mbB = array_descB.mb();
    int64_t nbB = array_descB.nb();
    int64_t mbC = array_descC.mb();
    int64_t nbC = array_descC.nb();
    int nprow = array_descA.nprows();
    int npcol = array_descA.npcols();
    int llda= array_descA.lld();
    int lldb= array_descB.lld();
    int lldc= array_descC.lld();
    cublasMpGrid_t grid = nullptr;

    cublasMpMatrixDescriptor_t descA = nullptr;
    cublasMpMatrixDescriptor_t descB = nullptr;
    cublasMpMatrixDescriptor_t descC = nullptr;
    char grid_layout = 'r';
    int rank=gpu_dev_stream.rank;
    int nranks=gpu_dev_stream.nranks;

    const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol);
    const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol);

    cublasMpHandle_t handle = nullptr;
    CUBLASMP_CHECK(cublasMpCreate(&handle, gpu_dev_stream.stream));
    void* d_work = nullptr;


    size_t workspaceInBytesOnDevice = 0;
    size_t workspaceInBytesOnHost = 0;

    // Global extents of A and B depend on whether they are transposed.
    int64_t global_m_a, global_n_a, global_m_b, global_n_b;
    if(transA == CUBLAS_OP_N){
        global_m_a = (ia - 1) + m;
        global_n_a = (ja - 1) + k;
    }else{
        global_m_a = (ia - 1) + k;
        global_n_a = (ja - 1) + m;
    }
    if(transB == CUBLAS_OP_N){
        global_m_b = (ib - 1) + k;
        global_n_b = (jb - 1) + n;
    }else{
        global_m_b = (ib - 1) + n;
        global_n_b = (jb - 1) + k;
    }
    const int64_t global_m_c = (ic - 1) + m;
    const int64_t global_n_c = (jc - 1) + n;

    CUBLASMP_CHECK(cublasMpGridCreate(
        handle, nprow, npcol,
        grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR,
        gpu_dev_stream.cal_comm, &grid)
    );

    CUBLASMP_CHECK(
        cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA));
    CUBLASMP_CHECK(
        cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB));
    CUBLASMP_CHECK(
        cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC));

    CUBLASMP_CHECK(cublasMpGemm_bufferSize(
        handle, transA, transB,
        m, n, k,
        alpha,
        d_A.ptr(), ia, ja, descA,
        d_B.ptr(), ib, jb, descB,
        beta,
        d_C.ptr(), ic, jc, descC,
        cublas_compute_type,
        &workspaceInBytesOnDevice, &workspaceInBytesOnHost)
    );
    gpu_dev_stream.cudaSync();
    CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, gpu_dev_stream.stream));
    std::vector h_work(workspaceInBytesOnHost);

    CUBLASMP_CHECK(cublasMpGemm(
        handle, transA, transB,
        m, n, k,
        alpha,
        d_A.ptr(), ia, ja, descA,
        d_B.ptr(), ib, jb, descB,
        beta,
        d_C.ptr(), ic, jc, descC,
        cublas_compute_type,
        d_work, workspaceInBytesOnDevice,
        h_work.data(), workspaceInBytesOnHost
    ));

    gpu_dev_stream.cudaSync();
    CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA));
    CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB));
    CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC));

    CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid));

    CUBLASMP_CHECK(cublasMpDestroy(handle));
    CUDA_CHECK(cudaFree(d_work));

    MPI_Barrier(MPI_COMM_WORLD);
}
// void CudaConnector::pgemm_nvhpc_cuFloatComplex(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k,
//     const void *alpha,
//     const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA,
//     const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB,
//     const
void *beta, +// cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, +// cublasComputeType_t cublas_compute_type) +// { +// using input_t = cuFloatComplex; +// using output_t = cuFloatComplex; +// using compute_t = cuFloatComplex; +// const cudaDataType_t cuda_input_type = CUDA_C_32F; +// const cudaDataType_t cuda_output_type = CUDA_C_32F; + +// int64_t mbA = array_descA.mb(); +// int64_t nbA = array_descA.nb(); +// int64_t mbB = array_descB.mb(); +// int64_t nbB = array_descB.nb(); +// int64_t mbC = array_descC.mb(); +// int64_t nbC = array_descC.nb(); +// int nprow = array_descA.nprows(); +// int npcol = array_descA.npcols(); +// int llda= array_descA.lld(); +// int lldb= array_descB.lld(); +// int lldc= array_descC.lld(); +// cublasMpGrid_t grid = nullptr; + +// cublasMpMatrixDescriptor_t descA = nullptr; +// cublasMpMatrixDescriptor_t descB = nullptr; +// cublasMpMatrixDescriptor_t descC = nullptr; +// char grid_layout = 'r'; +// int rank=gpu_dev_stream.rank; +// int nranks=gpu_dev_stream.nranks; +// // if(rank==0) +// // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); +// const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); +// const int mypcol = (grid_layout == 'c' ? 
rank / nprow : rank % npcol); +// const int local_device = gpu_dev_stream.local_device; +// cudaStream_t stream = gpu_dev_stream.stream; + +// cublasMpHandle_t handle = nullptr; +// CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); +// void* d_work = nullptr; + + +// size_t workspaceInBytesOnDevice = 0; +// size_t workspaceInBytesOnHost = 0; + +// const int64_t global_m_a = (ia - 1) + m; +// const int64_t global_n_a = (ja - 1) + k; +// const int64_t global_m_b = (ib - 1) + k; +// const int64_t global_n_b = (jb - 1) + n; +// const int64_t global_m_c = (ic - 1) + m; +// const int64_t global_n_c = (jc - 1) + n; + +// gpu_dev_stream.cudaSync(); +// // printf("before create grid, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGridCreate( +// handle, +// nprow, +// npcol, +// grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, +// gpu_dev_stream.cal_comm, +// &grid)); +// // printf("after create grid, rank:%d\n", rank); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_input_type, grid, &descA)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_input_type, grid, &descB)); +// CUBLASMP_CHECK( +// cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_output_type, grid, &descC)); +// // printf("after create desc, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm_bufferSize( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// &workspaceInBytesOnDevice, +// &workspaceInBytesOnHost)); +// // printf("workspaceInBytesOnDevice=%zu, workspaceInBytesOnHost=%zu, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); +// CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); +// std::vector 
h_work(workspaceInBytesOnHost); + +// gpu_dev_stream.cudaSync(); + +// // const double begin = MPI_Wtime(); +// // printf("before gemm, rank:%d\n", rank); +// CUBLASMP_CHECK(cublasMpGemm( +// handle, +// transA, +// transB, +// m, +// n, +// k, +// alpha, +// d_A, +// ia, +// ja, +// descA, +// d_B, +// ib, +// jb, +// descB, +// beta, +// d_C, +// ic, +// jc, +// descC, +// cublas_compute_type, +// d_work, +// workspaceInBytesOnDevice, +// h_work.data(), +// workspaceInBytesOnHost)); +// // printf("after gemm, rank:%d\n", rank); +// CUDA_CHECK(cudaStreamSynchronize(stream)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); +// CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + +// CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + +// CUBLASMP_CHECK(cublasMpDestroy(handle)); +// CUDA_CHECK(cudaFree(d_work)); + +// MPI_Barrier(MPI_COMM_WORLD); +// } +void CudaConnector::pgemm_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = 
array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=gpu_dev_stream.rank; + int nranks=gpu_dev_stream.nranks; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? rank / nprow : rank % npcol); + const int local_device = gpu_dev_stream.local_device; + cudaStream_t stream = gpu_dev_stream.stream; + + cublasMpHandle_t handle = nullptr; + // printf("before create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + // printf("after create handle, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + // gpu_dev_stream.cudaSync(); + // printf("before create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? 
CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + // printf("after create grid, rank:%d\n", rank); + // CudaConnector::check_memory(gpu_dev_stream); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + // printf("workspaceInBytesOnDevice=%zu GiB, workspaceInBytesOnHost=%zu GiB, rank:%d\n", workspaceInBytesOnDevice, workspaceInBytesOnHost, rank); + gpu_dev_stream.cudaSync(); + // printf("before malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + // printf("after malloc d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + std::vector h_work(workspaceInBytesOnHost); + + // gpu_dev_stream.cudaSync(); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + // gpu_dev_stream.cudaSync(); + // printf("before free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + // printf("after free d_work, rank:%d\n"); + // CudaConnector::check_memory(gpu_dev_stream); + gpu_dev_stream.cudaSync(); + 
CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); + + MPI_Barrier(MPI_COMM_WORLD); +} + + +void CudaConnector::pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type +) +{ + cudaDataType_t cuda_compute_type; + if(cublas_compute_type == CUBLAS_COMPUTE_64F_PEDANTIC){ + cuda_compute_type = CUDA_C_64F; + }else if(cublas_compute_type == CUBLAS_COMPUTE_32F_PEDANTIC){ + cuda_compute_type = CUDA_C_32F; + }else{ + fprintf(stderr, "Unsupported cublas_compute_type\n"); + } + int64_t mbA = array_descA.mb(); + int64_t nbA = array_descA.nb(); + int64_t mbB = array_descB.mb(); + int64_t nbB = array_descB.nb(); + int64_t mbC = array_descC.mb(); + int64_t nbC = array_descC.nb(); + int nprow = array_descA.nprows(); + int npcol = array_descA.npcols(); + int llda= array_descA.lld(); + int lldb= array_descB.lld(); + int lldc= array_descC.lld(); + cublasMpGrid_t grid = nullptr; + + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + cublasMpMatrixDescriptor_t descC = nullptr; + char grid_layout = 'r'; + int rank=mpi_comm_global_h.myid; + int nranks=mpi_comm_global_h.nprocs; + // if(rank==0) + // printf("sizeof(input_t)=%zu sizeof(output_t)=%zu sizeof(compute_t)=%zu\n", sizeof(input_t), sizeof(output_t), sizeof(compute_t)); + const int myprow = (grid_layout == 'c' ? rank % nprow : rank / npcol); + const int mypcol = (grid_layout == 'c' ? 
rank / nprow : rank % npcol); + const int local_device = device_stream.local_device; + cudaStream_t stream = (cudaStream_t)device_stream.stream; + + cublasMpHandle_t handle = nullptr; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + void* d_work = nullptr; + + + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + + int64_t global_m_a, global_n_a, global_m_b, global_n_b; + if(transA == CUBLAS_OP_N){ + global_m_a = (ia - 1) + m; + global_n_a = (ja - 1) + k; + }else{ + global_m_a = (ia - 1) + k; + global_n_a = (ja - 1) + m; + } + if(transB == CUBLAS_OP_N){ + global_m_b = (ib - 1) + k; + global_n_b = (jb - 1) + n; + }else{ + global_m_b = (ib - 1) + n; + global_n_b = (jb - 1) + k; + } + const int64_t global_m_c = (ic - 1) + m; + const int64_t global_n_c = (jc - 1) + n; + + CUBLASMP_CHECK(cublasMpGridCreate( + handle, + nprow, + npcol, + grid_layout == 'c' ? CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + device_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_a, global_n_a, mbA, nbA, 0, 0, llda, cuda_compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, cuda_compute_type, grid, &descB)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle,global_m_c, global_n_c, mbC, nbC, 0, 0, lldc, cuda_compute_type, grid, &descC)); + + CUBLASMP_CHECK(cublasMpGemm_bufferSize( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + cublas_compute_type, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost) + ); + device_stream.cudaSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + // const double begin = MPI_Wtime(); + CUBLASMP_CHECK(cublasMpGemm( + handle, transA, transB, + m, n, k, + alpha, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + beta, + d_C, ic, jc, descC, + 
cublas_compute_type, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost) + ); + + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + + device_stream.cudaSync(); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descB)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,descC)); + + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid)); + + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +void CudaConnector::pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type, + const char& order +) +{ + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + + int nprow, npcol; + assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + ORDER_CHECK(order); + if(order == 'c'||order == 'C'){ + nprow = array_descA.npcols(); + npcol = array_descA.nprows(); + }else{ + nprow = array_descA.nprows(); + npcol = array_descA.npcols(); + } + int llda,loc_n_a,lldb,loc_n_b,mA,nA,mB,nB; + if(order == 'c'||order == 'C'){ + llda = array_descA.n_loc(); + loc_n_a = array_descA.m_loc(); + lldb = array_descB.n_loc(); + loc_n_b = array_descB.m_loc(); + mA = array_descA.n(); + nA = array_descA.m(); + mB = array_descB.n(); + nB = array_descB.m(); + + }else{ + llda = array_descA.m_loc(); + loc_n_a = array_descA.n_loc(); + lldb = array_descB.m_loc(); + loc_n_b = array_descB.n_loc(); + mA = array_descA.m(); + nA = array_descA.n(); + mB = array_descB.m(); + nB = array_descB.n(); + } + if(trans == CUBLAS_OP_N){ 
+ assert(mA == nA); + assert(mB == nB); + }else{ + assert(mA == nB); + assert(mB == nA); + } + const int global_m_a = ia-1+mA; + const int global_n_a = ja-1+nA; + const int global_m_b = ib-1+mB; + const int global_n_b = jb-1+nB; + int rank = gpu_dev_stream.rank; + int nranks = gpu_dev_stream.nranks; + + const int local_device = gpu_dev_stream.local_device; + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + order=='c'||order=='C'?CUBLASMP_GRID_LAYOUT_COL_MAJOR : CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGeadd_bufferSize( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + + CUDA_CHECK(cudaMallocAsync(&d_work, workspaceInBytesOnDevice, stream)); + gpu_dev_stream.calSync(); + std::vector h_work(workspaceInBytesOnHost); + CUBLASMP_CHECK(cublasMpGeadd( + handle, trans, + mB, nB, + alpha, + d_A, ia, ja, descA, + beta, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost)); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} + +void 
CudaConnector::pgemr2d_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const int& m, const int& n, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& compute_type +) +{ + + cudaStream_t stream = gpu_dev_stream.stream; + cublasMpHandle_t handle = nullptr; + const int64_t mbA = array_descA.mb(); + const int64_t nbA = array_descA.nb(); + const int64_t mbB = array_descB.mb(); + const int64_t nbB = array_descB.nb(); + int nprow, npcol; + assert(array_descA.nprows() == array_descB.nprows()); + assert(array_descA.npcols() == array_descB.npcols()); + nprow = array_descA.nprows(); + npcol = array_descA.npcols(); + + int llda = array_descA.m_loc(); + int loc_n_a = array_descA.n_loc(); + + int lldb = array_descB.m_loc(); + int loc_n_b = array_descB.n_loc(); + + const int global_m_a = ia-1+m; + const int global_n_a = ja-1+n; + + const int global_m_b = ib-1+m; + const int global_n_b = jb-1+n; + + CUBLASMP_CHECK(cublasMpCreate(&handle, stream)); + cublasMpGrid_t grid = nullptr; + cublasMpMatrixDescriptor_t descA = nullptr; + cublasMpMatrixDescriptor_t descB = nullptr; + void* d_work = nullptr; + size_t workspaceInBytesOnDevice = 0; + size_t workspaceInBytesOnHost = 0; + CUBLASMP_CHECK(cublasMpGridCreate( + handle, nprow, npcol, + CUBLASMP_GRID_LAYOUT_ROW_MAJOR, + gpu_dev_stream.cal_comm, + &grid) + ); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_a, global_n_a, mbA, nbA, 0, 0, llda, compute_type, grid, &descA)); + CUBLASMP_CHECK( + cublasMpMatrixDescriptorCreate(handle, global_m_b, global_n_b, mbB, nbB, 0, 0, lldb, compute_type, grid, &descB)); + CUBLASMP_CHECK(cublasMpGemr2D_bufferSize( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + &workspaceInBytesOnDevice, &workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMallocAsync(&d_work, 
workspaceInBytesOnDevice, stream)); + std::vector h_work(workspaceInBytesOnHost); + + CUBLASMP_CHECK(cublasMpGemr2D( + handle, + m, n, + d_A, ia, ja, descA, + d_B, ib, jb, descB, + d_work, workspaceInBytesOnDevice, + h_work.data(), workspaceInBytesOnHost, + gpu_dev_stream.cal_comm) + ); + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaFreeAsync(d_work, stream)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descA)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle, descB)); + CUBLASMP_CHECK(cublasMpGridDestroy(handle, grid)); + CUBLASMP_CHECK(cublasMpDestroy(handle)); +} +#endif + diff --git a/src/cuda_connector.cu b/src/cuda_connector.cu new file mode 100644 index 00000000..f6ffa7db --- /dev/null +++ b/src/cuda_connector.cu @@ -0,0 +1,601 @@ +#include "cuda_connector.h" +#include "device_stream.h" +// add by hbchen in 2025-05-19 +__device__ int get_indxg2p(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return (isrcproc + indxglob / nb) % nprocs; +} +__device__ int get_indxg2l(const int &indxglob, const int &nb, const int &iproc, + const int &isrcproc, const int &nprocs) +{ + return nb * (indxglob / (nb * nprocs)) + indxglob % nb; +} +__device__ int get_index_g2l_r(const int& gindx, const int& m, const int& mb,const int &myprow, const int& irsrc, const int& nprows) +{ + return myprow != get_indxg2p(gindx, mb, myprow, irsrc, nprows) || + gindx >= m + ? -1 + : get_indxg2l(gindx, mb, myprow, irsrc, nprows); +} +__device__ int get_index_g2l_c(const int& gindx, const int &n, const int &nb, const int &mypcol, const int &icsrc, const int &npcols) +{ + return mypcol != get_indxg2p(gindx, nb, mypcol, icsrc, npcols) || + gindx >= n + ? 
-1 + : get_indxg2l(gindx, nb, mypcol, icsrc, npcols); +} +__global__ void det_multiply_seq(const cuDoubleComplex *inC,const int* d_ipiv,const int& num, cuDoubleComplex* ouC){ + int f = blockIdx.x*blockDim.x + threadIdx.x; + // printf("f = %d\n", f); + ouC[f]=make_cuDoubleComplex(1.0, 0.0); + for(int i=f;i>>(d_a,d_ipiv,*d_n,d_ouC); + cudaMemcpyAsync(h_ouC, d_ouC, blockSize*sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_ouC, stream); + cudaFreeAsync(d_n, stream); + for(int i=0;i>>(d_a,*d_n); + cudaStreamSynchronize(stream); + cudaFreeAsync(d_n, stream); +} +void CudaConnector::multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream) +{ + int blockSize = 256; + cuDoubleComplex * d_num; + int *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int), stream)); + int len = mat.nr()*mat.nc(); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuDoubleComplex), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(cuDoubleComplex), cudaMemcpyHostToDevice, stream)); + int gridSize = (mat.nr()*mat.nc() + blockSize - 1) / blockSize; + multiply_number_for_ComplexMatrixDevice_kernel<<>>(mat.ptr(), *d_num, *d_len); +} +__global__ void diag_add_ComplexMatrixDevice_kernel(cuDoubleComplex* d_a, const double& d_num, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < d_m) + { + int col = row; // Diagonal element + int local_row = get_index_g2l_r(row, d_m, d_mb, d_myprow, d_irsrc, d_nprows); + int local_col = get_index_g2l_c(col, d_n, d_nb, d_mypcol, d_icsrc, d_npcols); + if (local_row != -1 && local_col != -1 && 
local_row < d_m_loc && local_col < d_n_loc) + { + int local_index = local_row + local_col * d_m_loc; // Column-major order + d_a[local_index].x += d_num; // Add the real number to the real part + } + } +} + +void CudaConnector::diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream) +{ + double *d_num; + int blockSize = 256; + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, &num, sizeof(double), cudaMemcpyHostToDevice, stream)); + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + 
CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int gridSize = (arrdesc.m() + blockSize - 1) / blockSize; + diag_add_ComplexMatrixDevice_kernel<<>>(mat.ptr(),*d_num,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_num, stream)); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + +} +__global__ void diag_add_matrix_device_blacs_kernel(const cuDoubleComplex* num, cuDoubleComplex* d_A, const Array_Desc_Device& array_desc_device) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if(i>=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + if(jlo==-1) + return; + if(ilo>>((cuDoubleComplex*)d_num,(cuDoubleComplex*)d_A,*d_array_desc_device); + + }else 
if(compute_type == LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(cuFloatComplex), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(cuFloatComplex), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else if(compute_type == LIBRPA_COMPUTE_TYPE_DOUBLE){ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(double), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(double), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + }else{ + CUDA_CHECK(cudaMallocAsync((void**)&d_num, sizeof(float), (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_num, num, sizeof(float), cudaMemcpyHostToDevice, (cudaStream_t)device_stream.stream)); + } + CUDA_CHECK(cudaFreeAsync(d_num, (cudaStream_t)device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, (cudaStream_t)device_stream.stream)); + +} +__global__ void det_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in,cuDoubleComplex* d_out,const int& m,const int& n,const int& m_loc,const int& n_loc,const int& mb,const int& nb,const int& myprow,const int& mypcol,const int& irsrc,const int& icsrc,const int& nprows,const int& npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // printf("idx=%d\n",idx); + d_out[idx] = make_cuDoubleComplex(1.0,0.0); + + for(int i=idx;i CudaConnector::det_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice&d_A, const LIBRPA::Array_Desc &arrdesc_pi) +{ + std::complex det_loc={1.0,0.0}; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + 
CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc_pi.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc_pi.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc_pi.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc_pi.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc_pi.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, &arrdesc_pi.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc_pi.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc_pi.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc_pi.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc_pi.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc_pi.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc_pi.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + int blockSize = 256; + int gridSize = 1; + cuDoubleComplex* d_detA; + std::complex *h_detA = new std::complex[blockSize]; + CUDA_CHECK(cudaMallocAsync((void**)&d_detA, 
sizeof(cuDoubleComplex) * blockSize, stream)); + // printf("before det_ComplexMatrixDevice_blacs_kernel\n"); + // CUDA_CHECK(cudaStreamSynchronize(stream)); + det_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_detA,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + // printf("after det_ComplexMatrixDevice_blacs_kernel\n"); + CUDA_CHECK(cudaMemcpyAsync(h_detA, d_detA, sizeof(cuDoubleComplex) * blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_detA, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i>>(d_in, d_out, *d_m_ptr, *d_n_ptr, *d_conjugate_ptr); + // printf("after transpose kernel\n"); + CUDA_CHECK(cudaFreeAsync(d_m_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_n_ptr, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_conjugate_ptr, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + in.set_ptr(d_out); + in.set_nr(n); + in.set_nc(m); +} +__global__ void cuFloatComplex_to_cuDoubleComplex_kernel(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_out[idx] = make_cuDoubleComplex(static_cast(d_in[idx].x), static_cast(d_in[idx].y)); + } +} +void CudaConnector::cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_in, cuDoubleComplex* d_out, const int64_t& len, const cudaStream_t& stream) +{ + int blockSize = 
256;
    int gridSize = (len + blockSize - 1) / blockSize;
    // Launch the kernel.
    // FIXME(review): the kernel above still declares its length parameter as
    // `const int64_t&`, so `len` is staged in device memory here and `*d_len`
    // dereferences a DEVICE pointer on the HOST — only valid with
    // managed/unified memory, and `d_len` is never freed. Migrate the kernel
    // to a by-value parameter (as done for the kernels below) — TODO confirm.
    int64_t *d_len;
    CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), stream));
    CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, stream));
    cuFloatComplex_to_cuDoubleComplex_kernel<<<gridSize, blockSize, 0, stream>>>(d_in, d_out, *d_len);
}

/// Element-wise down-conversion: cuDoubleComplex -> cuFloatComplex.
/// The length is passed by value: CUDA kernel arguments are copied into the
/// kernel parameter space at launch, so no device-side staging is needed.
__global__ void cuDoubleComplex_to_cuFloatComplex_kernel(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t d_len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < d_len)
    {
        d_b[idx] = make_cuFloatComplex(static_cast<float>(d_a[idx].x), static_cast<float>(d_a[idx].y));
    }
}

/// Asynchronously convert `len` cuDoubleComplex values in d_a to
/// cuFloatComplex in d_b on `stream`.
/// Fix: the previous version copied `len` into device memory, dereferenced
/// that device pointer on the host (`*d_len` — invalid without unified
/// memory) and leaked the allocation; the scalar is now passed by value.
void CudaConnector::cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream)
{
    const int blockSize = 256;
    const int gridSize = (len + blockSize - 1) / blockSize;
    cuDoubleComplex_to_cuFloatComplex_kernel<<<gridSize, blockSize, 0, stream>>>(d_a, d_b, len);
}

/// Element-wise widening copy: float -> double.
__global__ void float_to_double_kernel(const float* d_a, double* d_b, const int64_t d_len)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < d_len)
    {
        d_b[idx] = d_a[idx];
    }
}

/// Widen `len` floats in d_a to doubles in d_b on the global device stream.
/// Fix: the kernel now receives the length by value instead of through a
/// host dereference of a device pointer. The d_len staging buffer is still
/// allocated (it is freed immediately after the launch) so the surrounding
/// code is unchanged.
void CudaConnector::float_to_double_device(const float* d_a, double* d_b, const int64_t& len)
{
    const int blockSize = 256;
    const int gridSize = (len + blockSize - 1) / blockSize;
    int64_t *d_len;
    CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream));
    CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream));
    float_to_double_kernel<<<gridSize, blockSize, 0, device_stream.stream>>>(d_a, d_b, len);
CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void double_to_float_kernel(const double* d_a, float* d_b, const int64_t& d_len) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < d_len) + { + d_b[idx] = static_cast(d_a[idx]); + } +} +void CudaConnector::double_to_float_device(const double* d_a, float* d_b, const int64_t& len) +{ + int blockSize = 256; + int gridSize = (len + blockSize - 1) / blockSize; + int64_t *d_len; + CUDA_CHECK(cudaMallocAsync((void**)&d_len, sizeof(int64_t), device_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_len, &len, sizeof(int64_t), cudaMemcpyHostToDevice, device_stream.stream)); + double_to_float_kernel<<>>(d_a, d_b, *d_len); + CUDA_CHECK(cudaFreeAsync(d_len, device_stream.stream)); +} +__global__ void trace_ComplexMatrixDevice_blacs_kernel(const cuDoubleComplex* d_in, cuDoubleComplex* d_out, const int& d_m, const int& d_n, const int& d_m_loc, const int& d_n_loc, const int& d_mb, const int& d_nb, const int& d_myprow, const int& d_mypcol, const int& d_irsrc, const int& d_icsrc, const int& d_nprows, const int& d_npcols) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + d_out[idx] = make_cuDoubleComplex(0.0, 0.0); + for(int i=idx;i *trace_z = (std::complex*)trace; + CUDA_CHECK(cudaMallocAsync((void**)&d_trace, blockSize*sizeof(std::complex), device_stream.stream)); + h_trace = (std::complex*)malloc(blockSize*sizeof(std::complex)); + std::complex *h_trace_z = (std::complex*)h_trace; + *(trace_z) = std::complex(0.0, 0.0); + trace_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_trace, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_trace, d_trace, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i *det_z = (std::complex*)det; + CUDA_CHECK(cudaMallocAsync((void**)&d_det, blockSize*sizeof(std::complex), device_stream.stream)); + h_det = (std::complex*)malloc(blockSize*sizeof(std::complex)); + 
std::complex *h_det_z = (std::complex*)h_det; + *(det_z) = std::complex(1.0, 0.0); + det_matrix_device_blacs_kernel<<>>((const cuDoubleComplex*)d_A, (cuDoubleComplex*)d_det, *d_array_desc_device); + CUDA_CHECK(cudaMemcpyAsync(h_det, d_det, blockSize*sizeof(std::complex), cudaMemcpyDeviceToHost, device_stream.stream)); + device_stream.sync(); + for(int i=0;i CudaConnector::trace_ComplexMatrixDevice_blacs(const cudaStream_t& stream, const ComplexMatrixDevice& d_A, const LIBRPA::Array_Desc &arrdesc) +{ + + int blockSize = 256; + int *d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols; + CUDA_CHECK(cudaMallocAsync((void**)&d_m, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_m_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_n_loc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nb, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_myprow, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_mypcol, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_irsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_icsrc, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_nprows, sizeof(int), stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_npcols, sizeof(int), stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m, &arrdesc.m(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n, &arrdesc.n(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_m_loc, &arrdesc.m_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_n_loc, &arrdesc.n_loc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mb, &arrdesc.mb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nb, 
&arrdesc.nb(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_myprow, &arrdesc.myprow(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_mypcol, &arrdesc.mypcol(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_irsrc, &arrdesc.irsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_icsrc, &arrdesc.icsrc(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_nprows, &arrdesc.nprows(), sizeof(int), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(d_npcols, &arrdesc.npcols(), sizeof(int), cudaMemcpyHostToDevice, stream)); + cuDoubleComplex *d_out; + CUDA_CHECK(cudaMallocAsync((void**)&d_out, sizeof(cuDoubleComplex)*blockSize, stream)); + std::complex *h_out; + std::complex trace_loc = {0.0, 0.0}; + h_out = new std::complex[blockSize]; + int gridSize = 1; + trace_ComplexMatrixDevice_blacs_kernel<<>>(d_A.ptr(),d_out,*d_m,*d_n,*d_m_loc,*d_n_loc,*d_mb,*d_nb,*d_myprow,*d_mypcol,*d_irsrc,*d_icsrc,*d_nprows,*d_npcols); + CUDA_CHECK(cudaFreeAsync(d_m, stream)); + CUDA_CHECK(cudaFreeAsync(d_n, stream)); + CUDA_CHECK(cudaFreeAsync(d_m_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_n_loc, stream)); + CUDA_CHECK(cudaFreeAsync(d_mb, stream)); + CUDA_CHECK(cudaFreeAsync(d_nb, stream)); + CUDA_CHECK(cudaFreeAsync(d_myprow, stream)); + CUDA_CHECK(cudaFreeAsync(d_mypcol, stream)); + CUDA_CHECK(cudaFreeAsync(d_irsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_icsrc, stream)); + CUDA_CHECK(cudaFreeAsync(d_nprows, stream)); + CUDA_CHECK(cudaFreeAsync(d_npcols, stream)); + CUDA_CHECK(cudaMemcpyAsync(h_out, d_out, sizeof(cuDoubleComplex)*blockSize, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaFreeAsync(d_out, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + for(int i=0;i=array_desc_device.m()) + return; + int ilo = array_desc_device.indx_g2l_r(i); + if(ilo==-1) + return; + int jlo = array_desc_device.indx_g2l_c(i); + 
if(jlo==-1) + return; + d_A[ilo+jlo*array_desc_device.m_loc()] = make_cuDoubleComplex(1.0, 0.0); +} +void ComplexMatrixDevice::set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device& array_desc_device){ + this->set_as_zero(gpu_dev_stream.stream); + Array_Desc_Device* d_array_desc_device; + // printf("array_desc_device.m():%d\n", array_desc_device.m()); + CUDA_CHECK(cudaMallocAsync((void**)&d_array_desc_device, sizeof(Array_Desc_Device), gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(d_array_desc_device, &array_desc_device, sizeof(Array_Desc_Device), cudaMemcpyHostToDevice, gpu_dev_stream.stream)); + if(array_desc_device.m()!=array_desc_device.n()){ + fprintf(stderr,"Error: ComplexMatrixDevice::set_as_identity: m and n are not equal\n"); + std::abort(); + } + int blockSize = 256; + int gridSize = (array_desc_device.m()+blockSize-1)/blockSize; + set_as_identity_kernel<<>>(d_data, *d_array_desc_device); + CUDA_CHECK(cudaFreeAsync(d_array_desc_device, gpu_dev_stream.stream)); +} \ No newline at end of file diff --git a/src/cuda_connector.h b/src/cuda_connector.h new file mode 100644 index 00000000..f496bf08 --- /dev/null +++ b/src/cuda_connector.h @@ -0,0 +1,621 @@ +#ifndef CUDA_CONNECTOR_H +#define CUDA_CONNECTOR_H + +#pragma once +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#include "base_blacs.h" +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#include "scalapack_connector.h" +#include +#include +#include "gpu_device_stream.h" +#endif +using LIBRPA::Array_Desc; +#include "array_desc_device.h" +#include "matrix_device.h" +#ifdef ENABLE_NVHPC + + +class ComplexMatrixDevice{ +private: + int m=0; + int n=0; +public: + cuDoubleComplex* d_data=nullptr; + + cusolverMpGrid_t grid_cusolver=nullptr; + cusolverMpMatrixDescriptor_t 
desc_cusolver=nullptr;

    cublasMpGrid_t grid_cublas=nullptr;
    cublasMpMatrixDescriptor_t desc_cublas=nullptr;

    bool is_cusolver_init=false;
    bool is_cublas_init=false;

    /// Default: empty matrix, no device allocation.
    ComplexMatrixDevice(){
    }
    /// Allocate an uninitialized m x n device buffer (synchronous).
    /// Fix: the previous body started with `ComplexMatrixDevice();`, which is
    /// NOT constructor delegation in C++ — it constructs and immediately
    /// discards an unnamed temporary and has no effect on *this — so the
    /// misleading call was removed.
    ComplexMatrixDevice(const int& m, const int& n){
        this->m=m;
        this->n=n;
        CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex)));
    }
    /// Allocate an m x n device buffer and upload the host data c_data
    /// (both operations synchronous).
    ComplexMatrixDevice(const int& m, const int& n,void* c_data){
        this->m=m;
        this->n=n;
        CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex)));
        CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
    }
    /// (Re)allocate the device buffer only when the requested shape differs
    /// from the current one. A null stream means synchronous allocation.
    void set_data(const int& m, const int& n, const cudaStream_t& stream=nullptr){
        if(m!=this->m || n!=this->n){
            clean(stream);
            this->m=m;
            this->n=n;
            if(stream==nullptr){
                CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(cuDoubleComplex)));
            }else{
                CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(cuDoubleComplex),stream));
            }
        }
    }
    /// (Re)allocate if needed, then upload an m x n host buffer c_data.
    void set_data(const int& m, const int& n,const void* c_data, const cudaStream_t& stream=nullptr){
        set_data(m,n,stream);
        if(stream==nullptr){
            CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
        }else{
            CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice,stream));
        }
    }
    /// (Re)allocate if needed, then copy an m x n buffer d_A that already
    /// resides on the device of gpu_dev_stream (device-to-device copy; source
    /// and destination use the same local device here).
    void set_data_device(const int& m, const int& n,const void* d_A, const GpuDeviceStream& gpu_dev_stream)
    {
        if(m!=this->m || n!=this->n){
            set_data(m,n,gpu_dev_stream.stream);
        }

        CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data,gpu_dev_stream.local_device, d_A, gpu_dev_stream.local_device, m * n * sizeof(cuDoubleComplex), gpu_dev_stream.stream));
    }
    /// Lazily create the cusolverMp process grid and matrix descriptor for
    /// this matrix from the BLACS-style descriptor arrdesc.
    void cusolver_init(cusolverMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){
        if(!is_cusolver_init){
CUSOLVERMP_CHECK(cusolverMpCreateDeviceGrid(handle,&grid_cusolver,cal_comm,arrdesc.nprows(),arrdesc.npcols(),CUSOLVERMP_GRID_MAPPING_ROW_MAJOR)); + CUSOLVERMP_CHECK(cusolverMpCreateMatrixDesc(&desc_cusolver,grid_cusolver,CUDA_C_64F,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc())); + is_cusolver_init=true; + } + } + void cublas_init(cublasMpHandle_t handle, cal_comm_t cal_comm, const Array_Desc& arrdesc){ + if(!is_cublas_init){ + CUBLASMP_CHECK(cublasMpGridCreate(handle,arrdesc.nprows(),arrdesc.npcols(),CUBLASMP_GRID_LAYOUT_ROW_MAJOR,cal_comm,&grid_cublas)); + CUBLASMP_CHECK(cublasMpMatrixDescriptorCreate(handle,m,n,arrdesc.mb(),arrdesc.nb(),0,0,arrdesc.m_loc(),CUDA_C_64F,grid_cublas,&desc_cublas)); + is_cublas_init=true; + } + } + cuDoubleComplex* ptr(){ + return d_data; + } + const cuDoubleComplex* ptr() const { + return d_data; + } + int nr(){ + return this->m; + } + int nc(){ + return this->n; + } + void set_nr(int m){ + this->m=m; + } + void set_nc(int n){ + this->n=n; + } + void set_ptr(cuDoubleComplex* d_data){ + if(this->d_data!=nullptr){ + cudaFree(this->d_data); + this->d_data=nullptr; + } + this->d_data=d_data; + } + void clean(const cudaStream_t& stream=nullptr){ + if(d_data!=nullptr){ + if(stream==nullptr) + cudaFree(d_data); + else + cudaFreeAsync(d_data,stream); + d_data=nullptr; + } + this->m=0; + this->n=0; + } + void set_as_zero(const cudaStream_t& stream=nullptr); + void set_as_identity(const GpuDeviceStream& gpu_dev_stream, const Array_Desc_Device&); + void cublasClean(cublasMpHandle_t handle){ + if(desc_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpMatrixDescriptorDestroy(handle,desc_cublas)); + desc_cublas=nullptr; + } + if(grid_cublas!=nullptr){ + CUBLASMP_CHECK(cublasMpGridDestroy(handle,grid_cublas)); + grid_cublas=nullptr; + } + is_cublas_init=false; + } + ~ComplexMatrixDevice(){ + if(is_cublas_init){ + fprintf(stderr, "Error: ComplexMatrixDevice not cleaned cublasMp resources before destructing!\n"); + std::abort(); + } + // 
CUDA_CHECK(cudaDeviceSynchronize()); + if(d_data!=nullptr){ + CUDA_CHECK(cudaFree(d_data)); + d_data=nullptr; + } + if(desc_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyMatrixDesc(desc_cusolver)); + desc_cusolver=nullptr; + } + if(grid_cusolver!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroyGrid(grid_cusolver)); + grid_cusolver=nullptr; + } + } + +}; + + +#endif + +class CudaConnector +{ +public: + static + void write_file(double* A,int M,int N,const char* name){ + std::fstream out; + out.open(name, std::ios::out); + if (!out.is_open()) { + std::cerr << "Failed to open file: " << name << std::endl; + return; + } + for(int i=0;i *a, cuDoubleComplex *b, const int n, const int lda,bool is_transpose=false) + { + if (is_transpose) + { + #pragma omp parallel for + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + b[i*lda+j].x = a[j*n+i].real(); + b[i*lda+j].y = a[j*n+i].imag(); + } + } + } + else + { + #pragma omp parallel for + for (int i = 0; i < n*lda; i++) + { + b[i].x = a[i].real(); + b[i].y = a[i].imag(); + } + } + } + static inline + void cuDoubleComplexToDoubleComplex_host(std::complex *a,const cuDoubleComplex *b,const int n,const int lda,bool is_transpose=false) + { + if (is_transpose) + { + for (int i = 0; i < n; i++) + { + for (int j = 0; j < lda; j++) + { + a[j*n+i].real(b[i*lda+j].x); + a[j*n+i].imag(b[i*lda+j].y); + } + } + } + else + { + for (int i = 0; i < n*lda; i++) + { + a[i].real(b[i].x); + a[i].imag(b[i].y); + } + } + } + // update by chenhaobo in 2025-05-19 + static inline + void cuZgetrf_f(const int& m, const int& n, cuDoubleComplex *d_a, const int& lda, int *d_ipiv, int *d_info, cusolverDnHandle_t cusolverH) + { + cudaStream_t stream; + cusolverStatus_t status = cusolverDnGetStream(cusolverH, &stream); + // printf("cusolverDnGetStream status:%d\n",status); + // printf("Stream stream:%d\n",stream); + // printf("cusolverDnhandle:%d\n",cusolverH); + int lwork; + // auto start = std::chrono::high_resolution_clock::now(); 
+ cusolverDnZgetrf_bufferSize(cusolverH, m, n, d_a, lda, &lwork); + cuDoubleComplex *d_work; + cudaMallocAsync((void**)&d_work, lwork * sizeof(cuDoubleComplex), stream); + // cudaStreamSynchronize(stream); + cusolverDnZgetrf(cusolverH, m, n, d_a, lda, d_work, d_ipiv, d_info); + // cudaStreamSynchronize(stream); + // auto end = std::chrono::high_resolution_clock::now(); + // auto duration= std::chrono::duration_cast(end - start); + // int info; + // cudaMemcpy(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost); + // printf("stream:%d, info:%d,Time taken of the cudaStreamSynchronize on GPU: %lld microseconds\n",stream,info, duration.count()); + cudaFreeAsync(d_work, stream); + } + static + cuDoubleComplex det_cuDoubleComplex(const cuDoubleComplex *d_a,const int *d_ipiv, const int& n, const cudaStream_t stream); + static + void cuDoubleComplex_minus_identity_Async(cuDoubleComplex *d_a,const int& h_n, const cudaStream_t stream); + static inline void cuDoubleComplex_minus_identity_host(cuDoubleComplex* h_a, const int& n) + { + #pragma omp parallel for + for(int i=0;i *, const int &, + // const int &, const LIBRPA::Array_Desc &, int *, int &,const char order='C'); + static + void pzgetrf_nvhpc(const GpuDeviceStream&, ComplexMatrixDevice &, const int &, + const int &, const LIBRPA::Array_Desc &, int64_t *, int *,const char& order='C'); + static + void pgetrf_nvhpc_mixed_precision( + const GpuDeviceStream&, void *, + const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrf_nvhpc_mixed_precision( + void *, const int &, const int &, + const LIBRPA::Array_Desc &, int64_t *, int *, + const cudaDataType_t &,const char &order='C' + ); + static + void pgetrs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + const void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + const int64_t* d_ipiv, + void* d_B, const int64_t& IB, const int64_t& 
JB, const LIBRPA::Array_Desc &, + int* d_info,const cudaDataType_t& compute_type, + const char& order='C' + ); + static void pgetrf_trs_nvhpc_mixed_precision( + const GpuDeviceStream&, const cublasOperation_t&, + void* d_A, const int64_t& IA, const int64_t& JA, const LIBRPA::Array_Desc &, + void* d_B, const int64_t& IB, const int64_t& JB, const LIBRPA::Array_Desc &, + const cudaDataType_t& compute_type, const char& order='C' + ); + static + std::complex det_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + std::complex trace_ComplexMatrixDevice_blacs(const cudaStream_t&, const ComplexMatrixDevice&, const LIBRPA::Array_Desc &); + static + void trace_matrix_device_blacs(void* trace, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + static + void det_matrix_device_blacs(void* det, const void* d_A, const Array_Desc &arrdesc_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type); + + // static + // void pgemm_cublasMp(const char &transa, const char &transb, const int &m, const int &n, const int &k, + // const double &alphaD, const std::complex *A, const int &ia, const int &ja, const LIBRPA::Array_Desc &arrdesc_A, + // const std::complex *B, const int &ib, const int &jb, const LIBRPA::Array_Desc &arrdesc_B, + // const double &betaD, std::complex *C, const int &ic, const int &jc, const LIBRPA::Array_Desc &arrdesc_C); + + static + void pgemm_device(cublasMpHandle_t,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t, + const ComplexMatrixDevice &,int64_t,int64_t, + const void *, + ComplexMatrixDevice &,int64_t,int64_t, + cublasComputeType_t); + + static + void pgemm_nvhpc(const GpuDeviceStream&,cublasOperation_t,cublasOperation_t,const int &,const int &,const int &, + const void *, + const ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + const ComplexMatrixDevice 
&,int64_t,int64_t,const Array_Desc&, + const void *, + ComplexMatrixDevice &,int64_t,int64_t,const Array_Desc&, + cublasComputeType_t); + // static + // void pgemm_nvhpc_cuFloatComplex(const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB,const int & m,const int & n,const int & k, + // const void *alpha, + // const cuFloatComplex* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + // const cuFloatComplex* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + // const void *beta, + // cuFloatComplex * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + // cublasComputeType_t cublas_compute_type); + static + void pgemm_nvhpc_mixed_precision( + const GpuDeviceStream& gpu_dev_stream,cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgemm_nvhpc_mixed_precision( + cublasOperation_t transA,cublasOperation_t transB, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + cublasComputeType_t cublas_compute_type + ); + static + void pgeadd_nvhpc( + const GpuDeviceStream& gpu_dev_stream,const cublasOperation_t& trans, + const void *alpha, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + const void* beta, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t&, + const char& order = 'c' + ); + static + void pgemr2d_nvhpc( + const GpuDeviceStream& 
gpu_dev_stream,const int&,const int&, + const void* d_A, const int64_t& ia, const int64_t& ja, const Array_Desc& array_descA, + void* d_B, const int64_t& ib, const int64_t& jb, const Array_Desc& array_descB, + const cudaDataType_t& + ); + static + void cuFloatComplex_to_cuDoubleComplex_Async(const cuFloatComplex* d_a, cuDoubleComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void cuDoubleComplex_to_cuFloatComplex_Async(const cuDoubleComplex* d_a, cuFloatComplex* d_b, const int64_t& len, const cudaStream_t& stream); + static + void float_to_double_device(const float* d_a, double* d_b, const int64_t& len); + static + void double_to_float_device(const double* d_a, float* d_b, const int64_t& len); + static + void multiply_number_for_ComplexMatrixDevice(ComplexMatrixDevice& mat, const cuDoubleComplex& num, const cudaStream_t& stream); + static + void diag_add_ComplexMatrixDevice(ComplexMatrixDevice& mat, const double& num, const Array_Desc& arrdesc,const cudaStream_t& stream); + static + void diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc,const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static + void transpose_ComplexMatrixDevice(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& in ,bool is_conjugate=false); + + #endif + +}; + +#endif // CUDA_CONNECTOR_H \ No newline at end of file diff --git a/src/device_connector.cpp b/src/device_connector.cpp new file mode 100644 index 00000000..b343450c --- /dev/null +++ b/src/device_connector.cpp @@ -0,0 +1,248 @@ +#include "device_connector.h" +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif +#include "device_stream.h" + +void DeviceConnector::pgetrf_device_mixed_precision( + void* d_A, const int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order +){ + { + #ifdef ENABLE_NVHPC + cudaDataType_t cuda_compute_type; + 
if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuda_compute_type = CUDA_C_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuda_compute_type = CUDA_C_32F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cuda_compute_type = CUDA_R_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cuda_compute_type = CUDA_R_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgetrf_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + d_A, m, n, + arrdesc_pi, + d_ipiv, d_info, + cuda_compute_type, + order + ); + #else + fprintf(stderr, "Error: NVHPC is not enabled in this build, cannot call pgetrf_device_mixed_precision\n"); + exit(1); + #endif + } +} + +void DeviceConnector::pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + cublasComputeType_t cublas_compute_type; + cublasOperation_t transA, transB; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F_PEDANTIC; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + cublas_compute_type = CUBLAS_COMPUTE_64F; + }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){ + cublas_compute_type = CUBLAS_COMPUTE_32F; + }else{ + fprintf(stderr, "Error: Unsupported compute type in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transa=='N'||transa=='n'){ + transA = CUBLAS_OP_N; + }else if(transa=='T'||transa=='t'){ + transA = CUBLAS_OP_T; + }else if(transa=='C'||transa=='c'){ + transA = CUBLAS_OP_C; + 
}else{ + fprintf(stderr, "Error: Unsupported transa in pgemm_device_mixed_precision\n"); + exit(1); + } + if(transb=='N'||transb=='n'){ + transB = CUBLAS_OP_N; + }else if(transb=='T'||transb=='t'){ + transB = CUBLAS_OP_T; + }else if(transb=='C'||transb=='c'){ + transB = CUBLAS_OP_C; + }else{ + fprintf(stderr, "Error: Unsupported transb in pgemm_device_mixed_precision\n"); + exit(1); + } + CudaConnector::pgemm_nvhpc_mixed_precision( + transA, transB, + m, n, k, + alpha, + d_A, ia, ja, array_descA, + d_B, ib, jb, array_descB, + beta, + d_C, ic, jc, array_descC, + cublas_compute_type + ); + #endif +} + +void DeviceConnector::transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate +){ + #ifdef ENABLE_NVHPC + cublasHandle_t cublasH = NULL; + CUBLAS_CHECK(cublasCreate(&cublasH)); + CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream)); + cublasOperation_t trans = CUBLAS_OP_T; + if(is_conjugate) + trans = CUBLAS_OP_C; + if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){ + cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0); + cuDoubleComplex beta = make_cuDoubleComplex(0.0, 0.0); + CUBLAS_CHECK(cublasZgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuDoubleComplex*)d_A, n, + &beta, + (cuDoubleComplex*)d_B, m, + (cuDoubleComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){ + cuFloatComplex alpha = make_cuFloatComplex(1.0, 0.0); + cuFloatComplex beta = make_cuFloatComplex(0.0, 0.0); + CUBLAS_CHECK(cublasCgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (cuFloatComplex*)d_A, n, + &beta, + (cuFloatComplex*)d_B, m, + (cuFloatComplex*)d_B, m)); + }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){ + double alpha = 1.0; + double beta = 0.0; + CUBLAS_CHECK(cublasDgeam( + cublasH, trans, CUBLAS_OP_N, + m, n, + &alpha, + (double*)d_A, n, + &beta, + (double*)d_B, m, + (double*)d_B, m)); + + }else 
if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){
        float alpha = 1.0;
        float beta = 0.0;
        CUBLAS_CHECK(cublasSgeam(
            cublasH, trans, CUBLAS_OP_N,
            m, n,
            &alpha,
            (float*)d_A, n,
            &beta,
            (float*)d_B, m,
            (float*)d_B, m));
    }else{
        fprintf(stderr, "Error: Unsupported compute type in transpose_device_blas\n");
        exit(1);
    }
    CUBLAS_CHECK(cublasDestroy(cublasH));
    #endif
}

/// Scale the first n entries of d_A in place by *num using cublas<t>scal on
/// the global device stream; num and d_A are reinterpreted per compute_type.
/// Fix: the cuBLAS handle created here was never destroyed, leaking one
/// handle (and its workspace) per call — sibling transpose_device_blas
/// already destroys its handle; this function now does the same.
void DeviceConnector::num_multiply_matrix_device(
    const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type
){
    #ifdef ENABLE_NVHPC
    cublasHandle_t cublasH = NULL;
    CUBLAS_CHECK(cublasCreate(&cublasH));
    CUBLAS_CHECK(cublasSetStream(cublasH, (cudaStream_t)device_stream.stream));
    if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE){
        CUBLAS_CHECK(cublasZscal(
            cublasH, n,
            (cuDoubleComplex*)num,
            (cuDoubleComplex*)d_A, 1
        ));
    }else if(compute_type==LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT){
        CUBLAS_CHECK(cublasCscal(
            cublasH, n,
            (cuFloatComplex*)num,
            (cuFloatComplex*)d_A, 1
        ));
    }else if(compute_type==LIBRPA_COMPUTE_TYPE_DOUBLE){
        CUBLAS_CHECK(cublasDscal(
            cublasH, n,
            (double*)num,
            (double*)d_A, 1
        ));
    }else if(compute_type==LIBRPA_COMPUTE_TYPE_FLOAT){
        CUBLAS_CHECK(cublasSscal(
            cublasH, n,
            (float*)num,
            (float*)d_A, 1
        ));
    }else{
        fprintf(stderr, "Error: Unsupported compute type in num_multiply_matrix_device\n");
        exit(1);
    }
    CUBLAS_CHECK(cublasDestroy(cublasH));
    #endif
}

/// Forward the diagonal-add to the CUDA backend (no-op without NVHPC).
void DeviceConnector::diag_add_matrix_device_blacs(
    const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type
){
    #ifdef ENABLE_NVHPC
    CudaConnector::diag_add_matrix_device_blacs(
        num, d_A, array_desc, compute_type
    );
    #endif
}

/// Forward the distributed-trace computation to the CUDA backend
/// (no-op without NVHPC).
void DeviceConnector::trace_matrix_device_blacs(
    void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type
){
    #ifdef ENABLE_NVHPC
    CudaConnector::trace_matrix_device_blacs(
        trace, d_A, array_desc, compute_type
    );
    #endif
}

void
DeviceConnector::det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type +){ + #ifdef ENABLE_NVHPC + CudaConnector::det_matrix_device_blacs( + det, d_A, array_desc, compute_type + ); + #endif +} + +void DeviceConnector::float_to_double_device(float* d_A, double* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::float_to_double_device(d_A, d_B, n); + #endif +} + +void DeviceConnector::double_to_float_device(double* d_A, float* d_B, const int64_t& n) +{ + #ifdef ENABLE_NVHPC + CudaConnector::double_to_float_device(d_A, d_B, n); + #endif +} diff --git a/src/device_connector.h b/src/device_connector.h new file mode 100644 index 00000000..b4188f68 --- /dev/null +++ b/src/device_connector.h @@ -0,0 +1,54 @@ +#ifndef DEVICE_CONNECTOR_H +#define DEVICE_CONNECTOR_H +#include "base_blacs.h" +#include "helpers.h" +using LIBRPA::Array_Desc; + +class DeviceConnector{ +public: + static void pgetrf_device_mixed_precision( + void* d_A, const int& m, const int& n, + const LIBRPA::Array_Desc& arrdesc_pi, + int64_t* d_ipiv, int* d_info, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const char& order='C' + ); + static void pgemm_device_mixed_precision( + const char& transa, const char& transb, + const int & m,const int & n,const int & k, + const void *alpha, + const void* d_A,int64_t ia,int64_t ja,const Array_Desc& array_descA, + const void* d_B,int64_t ib,int64_t jb,const Array_Desc& array_descB, + const void *beta, + void * d_C,int64_t ic,int64_t jc,const Array_Desc& array_descC, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void transpose_device_blas( + const void* d_A, + const int& m, const int& n, + void* d_B, + const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type, + const bool& is_conjugate=false + ); + static void num_multiply_matrix_device( + const int& n, const void* num, void* d_A, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void 
diag_add_matrix_device_blacs( + const void* num, void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + + static void trace_matrix_device_blacs( + void* trace, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void det_matrix_device_blacs( + void* det, const void* d_A, const Array_Desc& array_desc, const LIBRPA_DEVICE_COMPUTE_TYPE& compute_type + ); + static void float_to_double_device(float* d_A, double* d_B, const int64_t& n); + static void double_to_float_device(double* d_A, float* d_B, const int64_t& n); + + +}; + + + +#endif // DEVICE_CONNECTOR_H \ No newline at end of file diff --git a/src/device_stream.cpp b/src/device_stream.cpp new file mode 100644 index 00000000..03b9ddfc --- /dev/null +++ b/src/device_stream.cpp @@ -0,0 +1,34 @@ +#include "device_stream.h" +#include "envs_mpi.h" +using LIBRPA::envs::mpi_comm_global_h; +void DeviceStream::init(){ + local_device = DeviceStream::getLocalDevice(); + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = DeviceStream::allgather; + params.req_test = DeviceStream::request_test; + params.req_free = DeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = mpi_comm_global_h.myid; + params.nranks = mpi_comm_global_h.nprocs; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate((cudaStream_t*)&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolverMp_handle, local_device, (cudaStream_t)stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublasMp_handle, (cudaStream_t)stream)); + #endif +} +void DeviceStream::check_memory(){ + #ifdef ENABLE_NVHPC + size_t free, total; + CUDA_CHECK(cudaMemGetInfo(&free, &total)); // 直接查询驱动 + printf("rank:%d, Used: %f GiB / %f GiB\n",mpi_comm_global_h.myid, (total - free) / (1024.0*1024.0*1024.0), total / (1024.0*1024.0*1024.0)); + #endif 
+} + + +DeviceStream device_stream = DeviceStream(); \ No newline at end of file diff --git a/src/device_stream.h b/src/device_stream.h new file mode 100644 index 00000000..91b9800c --- /dev/null +++ b/src/device_stream.h @@ -0,0 +1,134 @@ +#ifndef DEVICE_STREAM_H +#define DEVICE_STREAM_H + +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#include +#include +#endif +#include "helpers.h" +#include +class DeviceStream{ +private: + #ifdef ENABLE_NVHPC + cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? 
CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + #endif + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + #endif + + return localRank % deviceCount; + } +public: + int local_device; + #ifdef ENABLE_NVHPC + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + cusolverMpHandle_t cusolverMp_handle = nullptr; + cublasMpHandle_t cublasMp_handle = nullptr; + #endif + DeviceStream(){} + void check_memory(); + void init(); + void finalize(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + ~DeviceStream(){ + if(stream!=nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamDestroy((cudaStream_t)stream)); + #endif + stream=nullptr; + } + #ifdef ENABLE_NVHPC + if(cusolverMp_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolverMp_handle)); + cusolverMp_handle=nullptr; + } + if(cublasMp_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublasMp_handle)); + cublasMp_handle=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + #endif + } + void sync() const{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + #endif + } + #ifdef ENABLE_NVHPC + void cudaSync() const{ + 
CUDA_CHECK(cudaStreamSynchronize((cudaStream_t)stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,(cudaStream_t)stream)); + } + #endif +}; + +extern DeviceStream device_stream; +#endif // DEVICE_STREAM_H \ No newline at end of file diff --git a/src/epsilon_cuda.cpp b/src/epsilon_cuda.cpp new file mode 100644 index 00000000..dc194711 --- /dev/null +++ b/src/epsilon_cuda.cpp @@ -0,0 +1,1448 @@ +#include "epsilon_cuda.h" +#include "epsilon.h" +#include "device_stream.h" +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "atoms.h" +#include "constants.h" +#include "envs_blacs.h" +#include "envs_io.h" +#include "envs_mpi.h" +#include "lapack_connector.h" +#include "libri_utils.h" +#include "matrix_m_parallel_utils.h" +#include "parallel_mpi.h" +#include "params.h" +#include "pbc.h" +#include "profiler.h" +#include "scalapack_connector.h" +#include "stl_io_helper.h" +#include "utils_blacs.h" +#include "utils_io.h" +#include "utils_mem.h" +#include "utils_mpi_io.h" +#ifdef LIBRPA_USE_LIBRI +#include +#include +using RI::Tensor; +using RI::Communicate_Tensors_Map_Judge::comm_map2_first; +#endif +#ifdef ENABLE_NVHPC +#include "helpers.h" +#include +#include +#endif + +using LIBRPA::Array_Desc; +using LIBRPA::envs::blacs_ctxt_global_h; +using LIBRPA::envs::mpi_comm_global_h; +using LIBRPA::envs::ofs_myid; +using LIBRPA::utils::lib_printf; +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat) +{ + lib_printf("Begin to compute_RPA_correlation_blacs_2d_nvhpc myid: %d\n", mpi_comm_global_h.myid); + system("free -m"); + CorrEnergy corr; + if (mpi_comm_global_h.myid == 0) lib_printf("Calculating EcRPA with BLACS/ScaLAPACK_nvhpc 2D\n"); + const auto &mf = chi0.mf; + const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + if (mpi_comm_global_h.myid == 0) lib_printf("n_abf = %d\n", n_abf); + const auto part_range = 
LIBRPA::atomic_basis_abf.get_part_range(); + + mpi_comm_global_h.barrier(); + + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + auto chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + MatrixDevice> d_chi0_block, d_coul_block, d_coul_chi0_block; + #endif + vector> qpts; + for(const auto &q : chi0.klist) + { + qpts.push_back(q); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("processId:%d, q: (%f, %f, %f)\n", mpi_comm_global_h.myid, q.x, q.y, q.z); + #endif + } + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + if (mpi_comm_global_h.is_root()) lib_printf("Finish init RPA blacs 2d\n"); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before for loop processid:%d\n", mpi_comm_global_h.myid); + #endif +#ifdef LIBRPA_USE_LIBRI + + for (const auto &q : qpts) + { + coul_block.zero_out(); + + int iq = std::distance(klist.begin(), std::find(klist.begin(), klist.end(), q)); + std::array qa = {q.x, q.y, q.z}; + // collect the block elements of coulomb matrices + { + double vq_begin = omp_get_wtime(); + // LibRI tensor for communication, release once done + std::map>, Tensor>>> + coul_libri; + coul_libri.clear(); + for (const auto &Mu_Nu : local_atpair) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + + if (coulmat.count(Mu) == 0 || coulmat.at(Mu).count(Nu) == 0 || + coulmat.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, 
Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + coul_libri[Mu][{Nu, qa}] = Tensor>({n_mu, n_nu}, pvq); + } + double arr_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double comm_begin = omp_get_wtime(); + const auto IJq_coul = + comm_map2_first(mpi_comm_global_h.comm, coul_libri, s0_s1.first, s0_s1.second); + double comm_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + double block_begin = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(coul_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + double block_end = omp_get_wtime(); + lib_printf( + "Vq Time myid: %d arr_time: %f comm_time: %f block_time: %f pair_size: %d\n", + mpi_comm_global_h.myid, arr_end - vq_begin, comm_end - comm_begin, + block_end - block_begin, set_IJ_nabf_nabf.size()); + mpi_comm_global_h.barrier(); + double vq_end = omp_get_wtime(); + + if (mpi_comm_global_h.myid == 0) + lib_printf(" | Total vq time: %f lri_coul: %f comm_vq: %f block_vq: %f\n", + vq_end - vq_begin, comm_begin - vq_begin, block_begin - comm_begin, + vq_end - block_begin); + } + double chi_arr_time = 0.0; + double chi_comm_time = 0.0; + double chi_2d_time = 0.0; + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + const double freq_weight = chi0.tfg.find_freq_weight(freq); + double pi_freq_begin = omp_get_wtime(); + chi0_block.zero_out(); + { + double chi_begin_arr = omp_get_wtime(); + std::map>, Tensor>>> + chi0_libri; + atom_mapping::pair_t_old chi0_wq; + if(!chi0.get_chi0_q().empty()) + chi0_wq = chi0.get_chi0_q().at(freq).at(q); + chi0_libri.clear(); + if(!chi0.get_chi0_q().empty()) + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + 
std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = Tensor>({n_mu, n_nu}, pchi); + } + } + if (mpi_comm_global_h.is_root()) + { + lib_printf("Begin to clean chi0 !!! \n"); + LIBRPA::utils::display_free_mem(); + lib_printf("chi0_freq_q size: %d, freq: %f, q:( %f, %f, %f )\n", + chi0_wq.size(), freq, q.x, q.y, q.z); + } + if(!chi0.get_chi0_q().empty()) + chi0.free_chi0_q(freq, q); + + LIBRPA::utils::release_free_mem(); + mpi_comm_global_h.barrier(); + double chi_end_arr = omp_get_wtime(); + const auto IJq_chi0 = + comm_map2_first(mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + // ofs_myid << "IJq_chi0" << endl << IJq_chi0; + double chi_end_comm = omp_get_wtime(); + collect_block_from_ALL_IJ_Tensor(chi0_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + mpi_comm_global_h.barrier(); + double chi_end_2d = omp_get_wtime(); + + chi_arr_time = (chi_end_arr - chi_begin_arr); + chi_comm_time = (chi_end_comm - chi_end_arr); + chi_2d_time = (chi_end_2d - chi_end_comm); + } + + double pi_begin = omp_get_wtime(); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr(),device_stream.stream); + d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr(),device_stream.stream); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc(), coul_chi0_block.ptr(),device_stream.stream); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + bool is_mixed_precision = false; + if(is_mixed_precision) + { + MatrixDevice> d_coul_block_f(coul_block.nr(),coul_block.nc(),device_stream.stream); + MatrixDevice> d_chi0_block_f(chi0_block.nr(),chi0_block.nc(),device_stream.stream); + MatrixDevice> d_coul_chi0_block_f(coul_chi0_block.nr(),coul_chi0_block.nc(),device_stream.stream); + DeviceConnector::double_to_float_device((double*)d_coul_block.ptr(),(float*)d_coul_block_f.ptr(),d_coul_block.nr()*d_coul_block.nc()*2); + 
DeviceConnector::double_to_float_device((double*)d_chi0_block.ptr(),(float*)d_chi0_block_f.ptr(),d_chi0_block.nr()*d_chi0_block.nc()*2); + DeviceConnector::double_to_float_device((double*)d_coul_chi0_block.ptr(),(float*)d_coul_chi0_block_f.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + std::complex calpha_f(1.0f,0.0f),cbeta_f(0.0f,0.0f); + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha_f, + d_coul_block_f.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + &cbeta_f, + d_coul_chi0_block_f.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT + ); + DeviceConnector::float_to_double_device((float*)d_coul_chi0_block_f.ptr(),(double*)d_coul_chi0_block.ptr(),d_coul_chi0_block.nr()*d_coul_chi0_block.nc()*2); + }else{ + DeviceConnector::pgemm_device_mixed_precision( + 'N', 'N', n_abf, n_abf, n_abf, + &calpha, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + &cbeta, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + } + double pi_end = omp_get_wtime(); + complex trace_pi(0.0, 0.0); + complex trace_pi_loc(0.0, 0.0); + DeviceConnector::trace_matrix_device_blacs(&trace_pi_loc,d_coul_chi0_block.ptr(),desc_nabf_nabf,LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + DeviceConnector::num_multiply_matrix_device(d_coul_chi0_block.nr()*d_coul_chi0_block.nc(),&calpha1,d_coul_chi0_block.ptr(),LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + DeviceConnector::diag_add_matrix_device_blacs(&calpha,d_coul_chi0_block.ptr(),desc_nabf_nabf, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + + int64_t *d_ipiv; + int *d_info ; + CUDA_CHECK(cudaMallocAsync((void**)&d_ipiv, sizeof(int64_t)*max(desc_nabf_nabf.m_loc(), desc_nabf_nabf.n_loc()),device_stream.stream)); + CUDA_CHECK(cudaMallocAsync((void**)&d_info, sizeof(int),device_stream.stream)); + + complex ln_det = + 
compute_pi_det_blacs_2d_nvhpc(d_coul_chi0_block, desc_nabf_nabf, d_ipiv, d_info, 'c'); + CUDA_CHECK(cudaFreeAsync(d_ipiv,device_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_info,device_stream.stream)); + + double det_end = omp_get_wtime(); + mpi_comm_global_h.barrier(); + MPI_Allreduce(&trace_pi_loc, &trace_pi, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + double pi_freq_end = omp_get_wtime(); + if (mpi_comm_global_h.myid == 0) + { + lib_printf( + "| TIME of DET-freq-q: %f, q: ( %f, %f, %f) TOT: %f CHI_arr: %f CHI_comm: " + "%f, CHI_2d: %f, Pi: %f, Det: %f\n", + freq, q.x, q.y, q.z, pi_freq_end - pi_freq_begin, chi_arr_time, chi_comm_time, + chi_2d_time, pi_end - pi_begin, det_end - pi_end); + complex rpa_for_omega_q = trace_pi + ln_det; + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; //! check + tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + if (mpi_comm_global_h.myid == 0) + { + for (auto &q_crpa : cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + // cout << q_crpa.first << q_crpa.second << endl; + } + // cout << "gx_num_" << chi0.tfg.size() << " tot_RPA_energy: " << setprecision(8) + // <, atom_mapping::pair_t_old>> + pi_freq_q_Mu_Nu; + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + pi_freq_q_Mu_Nu = compute_Pi_q_MPI(chi0, coulmat); + else + pi_freq_q_Mu_Nu = compute_Pi_q(chi0, coulmat); + lib_printf("Finish Pi freq on Proc %4d, size %zu\n", mpi_comm_global_h.myid, + pi_freq_q_Mu_Nu.size()); + // mpi_comm_global_h.barrier(); + + int range_all = N_all_mu; + + vector part_range; + part_range.resize(atom_mu.size()); + part_range[0] = 0; + int count_range = 0; + + for (int I = 0; I != atom_mu.size() - 1; I++) + { + count_range += atom_mu[I]; + part_range[I + 1] = count_range; + } + + + // pi_freq_q contains all atoms + 
map, ComplexMatrix>> pi_freq_q; + + for(const auto &freq : chi0.tfg.get_freq_nodes()) + { + // printf("| process %d, freq: %f\n", mpi_comm_global_h.myid, freq); + map, atom_mapping::pair_t_old> freq_q_MuNupi; + if(!chi0.get_chi0_q().empty()) + freq_q_MuNupi=pi_freq_q_Mu_Nu.at(freq); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("success before freq_q_MuNupi processid:%d, freq_q_MuNupi.size(): %zu\n", + // mpi_comm_global_h.myid, freq_q_MuNupi.size()); + #endif + for(const auto &q:chi0.klist){ + atom_mapping::pair_t_old q_MuNupi; + if(!chi0.get_chi0_q().empty()) + q_MuNupi = freq_q_MuNupi.at(q); + const auto MuNupi = q_MuNupi; + pi_freq_q[freq][q].create(range_all, range_all); + + ComplexMatrix pi_munu_tmp(range_all, range_all); + pi_munu_tmp.zero_out(); + if(!chi0.get_chi0_q().empty()) + for (const auto &Mu_Nupi : MuNupi) + { + const auto Mu = Mu_Nupi.first; + const auto Nupi = Mu_Nupi.second; + const size_t n_mu = atom_mu[Mu]; + for (const auto &Nu_pi : Nupi) + { + const auto Nu = Nu_pi.first; + const auto pimat = Nu_pi.second; + const size_t n_nu = atom_mu[Nu]; + + for (size_t mu = 0; mu != n_mu; ++mu) + { + for (size_t nu = 0; nu != n_nu; ++nu) + { + pi_munu_tmp(part_range[Mu] + mu, part_range[Nu] + nu) += pimat(mu, nu); + } + } + } + } + if (LIBRPA::parallel_routing == LIBRPA::ParallelRouting::ATOM_PAIR || + LIBRPA::parallel_routing == LIBRPA::ParallelRouting::LIBRI) + { + mpi_comm_global_h.reduce_ComplexMatrix(pi_munu_tmp, pi_freq_q.at(freq).at(q), 0); + } + else + { + pi_freq_q.at(freq).at(q) = std::move(pi_munu_tmp); + } + } + } + // lib_printf("Finish Pi communicate %4d, size %zu\n", mpi_comm_global_h.myid, + // pi_freq_q_Mu_Nu.size()); + mpi_comm_global_h.barrier(); + // if (mpi_comm_global_h.myid == 0) + { + complex tot_RPA_energy(0.0, 0.0); + map, complex> cRPA_q; + int deviceCount; + cudaError_t err= cudaGetDeviceCount(&deviceCount); + + // if(err==cudaSuccess&&deviceCount>0&&deviceCount!=4) + // 
printf("cudaSuccess:%d\n",err==cudaSuccess&&deviceCount>0); + // ==============================test the velocity of cuda stream======================================== + + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + range_all=TEST_LU_NUMBER; + complex* temp_complex=new complex[range_all*range_all]; + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<> dis(-1.0, 1.0); + double temp_begin_time=omp_get_wtime(); + // #pragma omp parallel for + for(int i=0;i[range_all*range_all]; + complex* c_ptr=pi_freq_q[freq][q].c; + + double temp_begin_time=omp_get_wtime(); + #pragma omp parallel for + for(int i=0;i trace_pi = trace(pi_freq_q.at(freq).at(q)); + complex rpa_for_omega_q(0.0, 0.0); + int info_LU = 1; + int *ipiv = new int[range_all]; + cuDoubleComplex det_test; + if(range_all det_for_rpa(det_test.x, det_test.y); + // printf("det_for_rpa_gpu: %f+%fi\n",det_test.x,det_test.y); + + // auto end_time = std::chrono::high_resolution_clock::now(); + // auto duration = std::chrono::duration_cast(end_time - start_time); + // printf("LU time by lapack: %lld us\n", duration.count()); + // complex trace_pi; + complex ln_det; + ln_det = std::log(det_for_rpa); + + // printf("in_det: %f+%fi, trace_pi: %f+%fi\n", ln_det.real(), ln_det.imag(), + // trace_pi.real(), trace_pi.imag()); + // cout << "PI trace vector:" << endl; + // cout << endl; + rpa_for_omega_q = ln_det + trace_pi; + // cout << " ifreq:" << freq << " rpa_for_omega_k: " << rpa_for_omega_q << " + // lnt_det: " << ln_det << " trace_pi " << trace_pi << endl; + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n", + // tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration); + // printf("rpa_for_omega_q:%f+%fi,freq_weight:%f,irk_weight[q]:%f\n", + // rpa_for_omega_q.real(), rpa_for_omega_q.imag(), freq_weight, + // irk_weight[q]); + #ifdef OPEN_OMP_FOR_LU_DECOMPOSITION + #pragma omp critical + #endif + { + cRPA_q[q] += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + + 
tot_RPA_energy += rpa_for_omega_q * freq_weight * irk_weight[q] / TWO_PI; + } + // printf("freq: %f, q: (%f, %f, %f), rpa_for_omega_q: %f+%fi,tot_RPA_energy_gpu: %f+%fi,num_iteration:%d,deviceId:%d,streamId:%d\n", + // freq, q.x, q.y, q.z, rpa_for_omega_q.real(), rpa_for_omega_q.imag(),tot_RPA_energy.real(), tot_RPA_energy.imag(), num_iteration, deviceId, streamId); + + // printf("tot_RPA_energy_gpu: %f+%fi,num_iteration:%d\n",tot_RPA_energy.real(),tot_RPA_energy.imag(),num_iteration); + } + // double test_endTime=omp_get_wtime(); + // printf("task_time:%f(aimed to test whether task is congested)\n",test_endTime-test_startTime); + num_iteration++; + #ifndef OPEN_OMP_FOR_LU_DECOMPOSITION + printf("one task time:%f\n",omp_get_wtime()-one_task_begin); + #endif + } + } + + } + printf("time for calculate: %f\n",omp_get_wtime()-time_calculate); + printf("mpi_comm_global_h.myid:%d,num_iteration:%d\n",mpi_comm_global_h.myid,num_iteration); + #ifdef OPEN_TEST_FOR_LU_DECOMPOSITION + // printf("time for all tasks: %f\n",omp_get_wtime()-start_time); + #endif + printf("tot_RPA_energy_gpu: %f+%fi\n",tot_RPA_energy.real(),tot_RPA_energy.imag()); + if(range_all, complex> global_cRPA_q; + for (auto q_weight : irk_weight) + { + MPI_Reduce(&cRPA_q[q_weight.first], &global_cRPA_q[q_weight.first], 1, + MPI_DOUBLE_COMPLEX, MPI_SUM, 0, mpi_comm_global_h.comm); + } + + for (auto &q_crpa : global_cRPA_q) + { + corr.qcontrib[q_crpa.first] = q_crpa.second; + } + complex gather_tot_RPA_energy(0.0, 0.0); + MPI_Reduce(&tot_RPA_energy, &gather_tot_RPA_energy, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, + mpi_comm_global_h.comm); + corr.value = gather_tot_RPA_energy; + } + // printf("gather_tot_RPA_energy_gpu: %f+%fi\n",corr.value.real(),corr.value.imag()); + corr.etype = CorrEnergy::type::RPA; + return corr; +} +#ifdef ENABLE_NVHPC + +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &d_A, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order) +{ + MatrixDevice> d_A_T; + + 
ORDER_CHECK(order); + if(order=='C'||order=='c'){ + d_A_T.set_data(d_A.nc(), d_A.nr(), device_stream.stream); + DeviceConnector::transpose_device_blas(d_A.ptr(),d_A_T.nr(), d_A_T.nc(),d_A_T.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + } + int ia=1,ja=1; + bool is_mixed_precision=false; + if(is_mixed_precision){ + MatrixDevice> d_A_f(d_A.nr(),d_A.nc(),device_stream.stream); + + DeviceConnector::double_to_float_device((order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),(float*)d_A_f.ptr(),d_A.nr()*d_A.nc()*2); + + DeviceConnector::pgetrf_device_mixed_precision( + d_A_f.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + order + ); + DeviceConnector::float_to_double_device((float*)d_A_f.ptr(),(order=='C'||order=='c')?(double*)d_A_T.ptr():(double*)d_A.ptr(),d_A.nr()*d_A.nc()*2); + }else{ + DeviceConnector::pgetrf_device_mixed_precision( + (order=='C'||order=='c')?(void*)d_A_T.ptr():(void*)d_A.ptr(), ia, ja, arrdesc_pi, + d_ipiv, d_info, + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + order); + } + if(order=='C'||order=='c'){ + DeviceConnector::transpose_device_blas(d_A_T.ptr(),d_A.nr(),d_A.nc(), d_A.ptr(), LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE); + d_A_T.clean(device_stream.stream); + } + complex ln_det_loc(0.0, 0.0); + complex ln_det_all(0.0, 0.0); + std::complex det_loc; + + DeviceConnector::det_matrix_device_blacs( + &det_loc, d_A.ptr(), arrdesc_pi, LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE + ); + if(det_loc.real() > 0) + { + ln_det_loc = std::log(det_loc); + } + else + { + ln_det_loc = std::log(-det_loc); + } + + MPI_Allreduce(&ln_det_loc, &ln_det_all, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_global_h.comm); + return ln_det_all; +} +// Done: converge compute_Wc_freq_q_blacs and compute_Wc_freq_q_blacs_wing +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat_eps, + atpair_k_cplx_mat_t &coulmat_wc, + const vector> &epsmac_LF_imagfreq) +{ + map, matrix_m>>>::pair_t_old> + Wc_freq_q; + 
const complex CONE{1.0, 0.0}; + const int n_abf = LIBRPA::atomic_basis_abf.nb_total; + const auto part_range = LIBRPA::atomic_basis_abf.get_part_range(); + + if (mpi_comm_global_h.myid == 0) + { + cout << "Calculating Wc using NVHPC" << endl; + } + mpi_comm_global_h.barrier(); + + Profiler::start("compute_Wc_freq_q_blacs_init"); + Array_Desc desc_nabf_nabf(blacs_ctxt_global_h); + // Use a square blocksize instead max block, otherwise heev and inversion will complain about + // illegal parameter Maximal blocksize ensure that atom indices related to the rows/columns of a + // local matrix is minimized. + desc_nabf_nabf.init_square_blk(n_abf, n_abf, 0, 0); + // This, however, is not optimal for matrix operations, and may lead to segment fault during + // MPI operations with parallel linear algebra subroutine. Thus we define an optimal blocksize + Array_Desc desc_nabf_nabf_opt(blacs_ctxt_global_h); + const int nb_opt = min(128, desc_nabf_nabf.nb()); + desc_nabf_nabf_opt.init(n_abf, n_abf, nb_opt, nb_opt, 0, 0); + // obtain the indices of atom-pair block necessary to build 2D block of a Hermitian/symmetric + // matrix + const auto set_IJ_nabf_nabf = LIBRPA::utils::get_necessary_IJ_from_block_2D_sy( + 'U', LIBRPA::atomic_basis_abf, desc_nabf_nabf); + const auto s0_s1 = get_s0_s1_for_comm_map2_first(set_IJ_nabf_nabf); + // temp_block is used to collect data from IJ-pair data structure with comm_map2_first + auto temp_block = init_local_mat>(desc_nabf_nabf, MAJOR::COL); + #ifdef ENABLE_NVHPC + GpuDeviceStream gpu_dev_stream; + ComplexMatrixDevice d_temp_block; + ComplexMatrixDevice d_coul_block; + #endif + // Below are the working arrays for matrix operations + auto chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_eigen_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + auto coul_chi0_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + 
ComplexMatrixDevice d_chi0_block; + ComplexMatrixDevice d_coul_chi0_block; + ComplexMatrixDevice d_coul_eigen_block; + #endif + auto coulwc_block = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_coulwc_block; + #endif + + const double mem_blocks = (chi0_block.size() + coul_block.size() + coul_eigen_block.size() + + coul_chi0_block.size() + coulwc_block.size()) * + 16.0e-6; + ofs_myid << get_timestamp() + << " Memory consumption of task-local blocks for screened Coulomb [MB]: " << mem_blocks + << endl; + + const auto atpair_local = dispatch_upper_trangular_tasks( + natom, blacs_ctxt_global_h.myid, blacs_ctxt_global_h.nprows, blacs_ctxt_global_h.npcols, + blacs_ctxt_global_h.myprow, blacs_ctxt_global_h.mypcol); +#ifdef LIBRPA_DEBUG + ofs_myid << get_timestamp() << " atpair_local " << atpair_local << endl; + ofs_myid << get_timestamp() << " s0_s1 " << s0_s1 << endl; +#endif + + // IJ pair of Wc to be returned + pair, set> Iset_Jset_Wc; + for (const auto &ap : atpair_local) + { + Iset_Jset_Wc.first.insert(ap.first); + Iset_Jset_Wc.second.insert(ap.second); + } + + // Prepare local basis indices for 2D->IJ map + int I, iI; + map> map_lor_v; + map> map_loc_v; + for (int i_lo = 0; i_lo != desc_nabf_nabf.m_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_r(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_lor_v[I].push_back(iI); + } + for (int i_lo = 0; i_lo != desc_nabf_nabf.n_loc(); i_lo++) + { + int i_glo = desc_nabf_nabf.indx_l2g_c(i_lo); + LIBRPA::atomic_basis_abf.get_local_index(i_glo, I, iI); + map_loc_v[I].push_back(iI); + } + + vector> qpts; + for (const auto &q_weight : irk_weight) qpts.push_back(q_weight.first); + + vec eigenvalues(n_abf); + Profiler::cease("compute_Wc_freq_q_blacs_init"); + LIBRPA::utils::lib_printf_root("Time for Wc initialization (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_blacs_init"), + 
Profiler::get_cpu_time_last("compute_Wc_freq_q_blacs_init")); + + Profiler::start("compute_Wc_freq_q_work"); +#ifdef LIBRPA_USE_LIBRI + for (const auto &q : qpts) + { + const int iq = std::distance(qpts.cbegin(), std::find(qpts.cbegin(), qpts.cend(), q)); + const int iq_in_k = + std::distance(klist.cbegin(), std::find(klist.cbegin(), klist.cend(), q)); + // q-point in fractional coordinates + const auto &qf = kfrac_list[iq_in_k]; + LIBRPA::utils::lib_printf_root("Computing Wc(q), %d / %d, q=(%f, %f, %f)\n", iq + 1, + qpts.size(), qf.x, qf.y, qf.z); + coul_block.zero_out(); + coulwc_block.zero_out(); + // lib_printf("coul_block\n%s", str(coul_block).c_str()); + + // q-array for LibRI object + std::array qa = {q.x, q.y, q.z}; + + // collect the block elements of truncated coulomb matrices first + // as we reuse coul_eigen_block to reduce memory usage + Profiler::start("epsilon_prepare_coulwc_sqrt", "Prepare sqrt of truncated Coulomb"); + { + size_t n_singular_coulwc; + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + Profiler::start("epsilon_prepare_coulwc_sqrt_1", "Setup libRI object"); + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_wc.count(Mu) == 0 || coulmat_wc.at(Mu).count(Nu) == 0 || + coulmat_wc.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_wc.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt_1"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_2", "libRI Communicate"); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + 
mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_coulwc_sqrt_2"); + + Profiler::start("epsilon_prepare_coulwc_sqrt_3", "Collect 2D-block from IJ"); + + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coulwc_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + Profiler::stop("epsilon_prepare_coulwc_sqrt_3"); + Profiler::start("epsilon_prepare_coulwc_sqrt_4", "Perform square root"); + power_hemat_blacs(coulwc_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular_coulwc, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + Profiler::stop("epsilon_prepare_coulwc_sqrt_4"); + } + Profiler::stop("epsilon_prepare_coulwc_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Wc(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_coulwc_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_coulwc_sqrt")); + ofs_myid << get_timestamp() << " Done coulwc sqrt" << endl; + + Profiler::start("epsilon_prepare_couleps_sqrt", "Prepare sqrt of bare Coulomb"); + // collect the block elements of coulomb matrices + { + // LibRI tensor for communication, release once done + std::map>, RI::Tensor>>> + couleps_libri; + ofs_myid << get_timestamp() << " Start build couleps_libri" << endl; + for (const auto &Mu_Nu : atpair_local) + { + const auto Mu = Mu_Nu.first; + const auto Nu = Mu_Nu.second; + // ofs_myid << "Mu " << Mu << " Nu " << Nu << endl; + if (coulmat_eps.count(Mu) == 0 || coulmat_eps.at(Mu).count(Nu) == 0 || + coulmat_eps.at(Mu).at(Nu).count(q) == 0) + continue; + const auto &Vq = coulmat_eps.at(Mu).at(Nu).at(q); + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(Mu); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(Nu); + 
std::valarray> Vq_va(Vq->c, Vq->size); + auto pvq = std::make_shared>>(); + *pvq = Vq_va; + couleps_libri[Mu][{Nu, qa}] = RI::Tensor>({n_mu, n_nu}, pvq); + } + ofs_myid << get_timestamp() << " Done build couleps_libri" << endl; + // ofs_myid << "Couleps_libri" << endl << couleps_libri; + // if (couleps_libri.size() == 0) + // throw std::logic_error("data at q-point not found in coulmat_eps"); + + // perform communication + ofs_myid << get_timestamp() << " Start collect couleps_libri, targets" << endl; +#ifdef LIBRPA_DEBUG + ofs_myid << set_IJ_nabf_nabf << endl; + ofs_myid << "Extended blocks" << endl; + ofs_myid << "atom 1: " << s0_s1.first << endl; + ofs_myid << "atom 2: " << s0_s1.second << endl; +#endif + // ofs_myid << "Owned blocks\n"; + // print_keys(ofs_myid, couleps_libri); + // mpi_comm_global_h.barrier(); + const auto IJq_coul = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, couleps_libri, s0_s1.first, s0_s1.second); + ofs_myid << get_timestamp() << " Done collect couleps_libri, collected blocks" << endl; + + ofs_myid << get_timestamp() << " Start construct couleps 2D block" << endl; + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, LIBRPA::atomic_basis_abf, + qa, true, CONE, IJq_coul, MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, desc_nabf_nabf.desc, + coul_block.ptr(), 1, 1, desc_nabf_nabf_opt.desc, + blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_coul_block.set_data(coul_block.nr(),coul_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_coul_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + gpu_dev_stream.cudaSync(); + CUDA_CHECK(cudaMemcpy(coul_block.ptr(), d_coul_block.ptr(), sizeof(cuDoubleComplex)*coul_block.nr()*coul_block.nc(), cudaMemcpyDeviceToHost)); + #endif + ofs_myid << get_timestamp() << " 
Done construct couleps 2D block" << endl; + } + + size_t n_singular; + ofs_myid << get_timestamp() << " Start power hemat couleps\n"; + matrix_m> sqrtveig_blacs; + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_sqrtveig_blacs; + #endif + if (is_gamma_point(q)) + { + // choice of power_hemat_blacs_real/power_hemat_blacs_desc + // leads to sub-meV difference + sqrtveig_blacs = power_hemat_blacs_real( + coul_block, desc_nabf_nabf_opt, coul_eigen_block, desc_nabf_nabf_opt, n_singular, + eigenvalues.c, 0.5, Params::sqrt_coulomb_threshold); + if (Params::replace_w_head && Params::option_dielect_func == 3) + { + df_headwing.wing_mu_to_lambda(sqrtveig_blacs, desc_nabf_nabf_opt); + } + } + else + { + sqrtveig_blacs = power_hemat_blacs(coul_block, desc_nabf_nabf_opt, coul_eigen_block, + desc_nabf_nabf_opt, n_singular, eigenvalues.c, 0.5, + Params::sqrt_coulomb_threshold); + } + ofs_myid << get_timestamp() << " Done power hemat couleps\n"; + // lib_printf("nabf %d nsingu %lu\n", n_abf, n_singular); + // release sqrtv when the q-point is not Gamma, or macroscopic dielectric constant at + // imaginary frequency is not prepared + if (epsmac_LF_imagfreq.empty() || !is_gamma_point(q)) sqrtveig_blacs.clear(); + const size_t n_nonsingular = n_abf - n_singular; + if(gpu_dev_stream.rank==0){ + printf("n_abf:%lu,n_nonsingular:%lu,n_singular:%lu\n",n_abf,n_nonsingular,n_singular); + } + Profiler::stop("epsilon_prepare_couleps_sqrt"); + LIBRPA::utils::lib_printf_root( + "Time to prepare sqrt root of Coulomb for Epsilon(q) (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("epsilon_prepare_couleps_sqrt"), + Profiler::get_cpu_time_last("epsilon_prepare_couleps_sqrt")); + ofs_myid << get_timestamp() << " Done couleps sqrt\n"; + std::flush(ofs_myid); + + for (const auto &freq : chi0.tfg.get_freq_nodes()) + { + const auto ifreq = chi0.tfg.get_freq_index(freq); + Profiler::start("epsilon_wc_work_q_omega"); + Profiler::start("epsilon_prepare_chi0_2d", "Prepare Chi0 2D block"); + 
chi0_block.zero_out(); + { + std::map>, + RI::Tensor>>> + chi0_libri; + if (chi0.get_chi0_q().count(freq) > 0 && chi0.get_chi0_q().at(freq).count(q) > 0) + { + const auto &chi0_wq = chi0.get_chi0_q().at(freq).at(q); + for (const auto &M_Nchi : chi0_wq) + { + const auto &M = M_Nchi.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_chi : M_Nchi.second) + { + const auto &N = N_chi.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &chi = N_chi.second; + std::valarray> chi_va(chi.c, chi.size); + auto pchi = std::make_shared>>(); + *pchi = chi_va; + chi0_libri[M][{N, qa}] = + RI::Tensor>({n_mu, n_nu}, pchi); + } + } + // Release the chi0 block for this frequency and q to reduce memory load, + // as they will not be used again + chi0.free_chi0_q(freq, q); + } + // ofs_myid << "chi0_libri" << endl << chi0_libri; + Profiler::start("epsilon_prepare_chi0_2d_comm_map2"); + const auto IJq_chi0 = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, chi0_libri, s0_s1.first, s0_s1.second); + Profiler::stop("epsilon_prepare_chi0_2d_comm_map2"); + Profiler::start("epsilon_prepare_chi0_2d_collect_block"); + collect_block_from_ALL_IJ_Tensor(temp_block, desc_nabf_nabf, + LIBRPA::atomic_basis_abf, qa, true, CONE, IJq_chi0, + MAJOR::ROW); + #ifndef ENABLE_NVHPC + ScalapackConnector::pgemr2d_f(n_abf, n_abf, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc(),temp_block.ptr()); + d_chi0_block.set_data(chi0_block.nr(),chi0_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F + ); + #endif + Profiler::stop("epsilon_prepare_chi0_2d_collect_block"); + } + Profiler::stop("epsilon_prepare_chi0_2d"); + + 
Profiler::start("epsilon_compute_eps", "Compute dielectric matrix"); + if(gpu_dev_stream.rank==0) + printf("is_gamma_point(q):%d\n",is_gamma_point(q)); + const std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + if (epsmac_LF_imagfreq.size() > 0 && is_gamma_point(q)) + { + ofs_myid << get_timestamp() << " Entering dielectric matrix head overwrite" << endl; + // rotate to Coulomb-eigenvector basis + // descending order + + d_sqrtveig_blacs.set_data(sqrtveig_blacs.nr(), sqrtveig_blacs.nc(), sqrtveig_blacs.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + gpu_dev_stream.cudaSync(); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_abf, + &calpha, + d_chi0_block,1,1,desc_nabf_nabf_opt, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_C,CUBLAS_OP_N,n_nonsingular, n_nonsingular, n_abf, + &calpha, + d_sqrtveig_blacs,1,1,desc_nabf_nabf_opt, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + if (Params::option_dielect_func == 3) + { + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + // { + // std::string filename = "gpu_"; + // filename += to_string(gpu_dev_stream.nranks); + // filename += "_"; + // filename += to_string(gpu_dev_stream.rank); + // gpu_dev_stream.cudaSync(); + // CUDA_CHECK(cudaMemcpy(coul_chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*coul_chi0_block.nr()*coul_chi0_block.nc(),cudaMemcpyDeviceToHost)); + // 
CudaConnector::write_file((cuDoubleComplex*)coul_chi0_block.ptr(),coul_chi0_block.nr(),coul_chi0_block.nc(),filename.data()); + // } + // CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + + ofs_myid << get_timestamp() << "Perform the head & wing element overwrite" + << endl; + // df_headwing.rewrite_eps(chi0_block, ifreq, desc_nabf_nabf_opt); + df_headwing.rewrite_eps_nvhpc(gpu_dev_stream, d_chi0_block, ifreq, desc_nabf_nabf_opt); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + // rotate back to ABF + // descending order + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // subtract 1 from diagonal + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,-1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::start("epsilon_multiply_coulwc", "Multiply truncated Coulomb"); + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( 
+ gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_multiply_coulwc"); + } + else + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(0); + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(0); + if (ilo >= 0 && jlo >= 0) + { + ofs_myid << get_timestamp() << "Perform the head element overwrite" << endl; + std::complex temp_element = 1.0 - epsmac_LF_imagfreq[ifreq]; + CUDA_CHECK(cudaMemcpy(d_chi0_block.ptr() + ilo + jlo * d_chi0_block.nr(), + &temp_element, sizeof(cuDoubleComplex), + cudaMemcpyHostToDevice)); + } + d_coul_eigen_block.set_data(coul_eigen_block.nr(), coul_eigen_block.nc(), coul_eigen_block.ptr()); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_nonsingular, n_nonsingular, + &calpha, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_C,n_abf, n_abf, n_nonsingular, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_eigen_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + 
d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + 
d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + } + else + { + Profiler::start("epsilon_compute_eps_pgemm_1"); + // d_chi0_block.set_data(chi0_block.nr(), chi0_block.nc(), chi0_block.ptr()); + d_coul_block.set_data(coul_block.nr(), coul_block.nc(), coul_block.ptr()); + d_coul_chi0_block.set_data(coul_chi0_block.nr(), coul_chi0_block.nc()); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_block,1,1,desc_nabf_nabf_opt, + d_chi0_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + Profiler::cease("epsilon_compute_eps_pgemm_1"); + Profiler::start("epsilon_compute_eps_pgemm_2"); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + d_coul_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + // d_coul_block.cublasClean(gpu_dev_stream.cublas_handle); + Profiler::cease("epsilon_compute_eps_pgemm_2"); + // now chi0_block is actually v1/2 chi v1/2 + cuDoubleComplex calpha1; + calpha1.x = -1.0; + calpha1.y = 0.0; + CudaConnector::multiply_number_for_ComplexMatrixDevice(d_chi0_block,calpha1,gpu_dev_stream.stream); + gpu_dev_stream.cudaSync(); + CudaConnector::diag_add_ComplexMatrixDevice(d_chi0_block,1.0,desc_nabf_nabf_opt,gpu_dev_stream.stream); + Profiler::stop("epsilon_compute_eps"); + // now chi0_block is actually the dielectric matrix + // perform inversion + Profiler::start("epsilon_invert_eps and epsilon_multiply_coulwc", "Invert dielectric matrix and Multiply truncated Coulomb"); + char order = 'c'; + d_coulwc_block.set_data(coulwc_block.nr(), coulwc_block.nc(), coulwc_block.ptr(),gpu_dev_stream.stream); + d_coul_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), 
d_coulwc_block.ptr(),gpu_dev_stream); + int64_t* d_ipiv; + int* d_info; + CUDA_CHECK(cudaMallocAsync(&d_info,sizeof(int),gpu_dev_stream.stream)); + if(order == 'c'||order == 'C'){ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.n_loc(),gpu_dev_stream.stream)); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_chi0_block); + }else{ + CUDA_CHECK(cudaMallocAsync(&d_ipiv,sizeof(int64_t)*desc_nabf_nabf_opt.m_loc(),gpu_dev_stream.stream)); + } + CudaConnector::pgetrf_nvhpc_mixed_precision( + gpu_dev_stream, d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, d_info, + CUDA_C_64F, order + ); + + CudaConnector::pgetrs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_ipiv, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_info, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_coul_chi0_block); + } + CUDA_CHECK(cudaFreeAsync(d_info, gpu_dev_stream.stream)); + CUDA_CHECK(cudaFreeAsync(d_ipiv, gpu_dev_stream.stream)); + d_chi0_block.set_data_device(coulwc_block.nr(), coulwc_block.nc(), d_coulwc_block.ptr(), gpu_dev_stream); + + CudaConnector::pgeadd_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, + &calpha, + d_coul_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + &calpha1, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + CUDA_C_64F, order + ); + d_coul_chi0_block.set_data_device(d_chi0_block.nr(), d_chi0_block.nc(), d_chi0_block.ptr(), gpu_dev_stream); + CudaConnector::pgemm_nvhpc( + gpu_dev_stream,CUBLAS_OP_N,CUBLAS_OP_N,n_abf, n_abf, n_abf, + &calpha, + (order=='r'||order=='R')?d_coulwc_block:d_coul_chi0_block,1,1,desc_nabf_nabf_opt, + (order=='r'||order=='R')?d_coul_chi0_block:d_coulwc_block,1,1,desc_nabf_nabf_opt, + &cbeta, + d_chi0_block,1,1,desc_nabf_nabf_opt, + CUBLAS_COMPUTE_64F_PEDANTIC); + 
Profiler::stop("epsilon_invert_eps and epsilon_multiply_coulwc"); + } + // Array_Desc_Device array_desc_device(desc_nabf_nabf_opt); + // printf("successful create object array_desc_device\n"); + #ifndef ENABLE_NVHPC + gpu_dev_stream.calSync(); + CUDA_CHECK(cudaMemcpy(chi0_block.ptr(),d_chi0_block.ptr(),sizeof(cuDoubleComplex)*chi0_block.nr()*chi0_block.nc(),cudaMemcpyDeviceToHost)); + ScalapackConnector::pgemr2d_f(n_abf, n_abf, chi0_block.ptr(), 1, 1, + desc_nabf_nabf_opt.desc, temp_block.ptr(), 1, 1, + desc_nabf_nabf.desc, blacs_ctxt_global_h.ictxt); + #else + d_temp_block.set_data(temp_block.nr(),temp_block.nc()); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_abf, n_abf, + d_chi0_block.ptr(), 1, 1, desc_nabf_nabf_opt, + d_temp_block.ptr(), 1, 1, desc_nabf_nabf, + CUDA_C_64F + ); + CUDA_CHECK(cudaMemcpyAsync(temp_block.ptr(), d_temp_block.ptr(), sizeof(cuDoubleComplex)*temp_block.nr()*temp_block.nc(), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + #endif + + Profiler::start("epsilon_convert_wc_2d_to_ij", "Convert Wc, 2D -> IJ"); + Profiler::start("epsilon_convert_wc_map_block", "Initialize Wc atom-pair map"); + map>>> Wc_MNmap; + // map_block_to_IJ_storage(Wc_MNmap, LIBRPA::atomic_basis_abf, + // LIBRPA::atomic_basis_abf, chi0_block, + // desc_nabf_nabf, MAJOR::ROW); + map_block_to_IJ_storage_new(Wc_MNmap, LIBRPA::atomic_basis_abf, map_lor_v, map_loc_v, + temp_block, desc_nabf_nabf, MAJOR::ROW); + Profiler::stop("epsilon_convert_wc_map_block"); + + Profiler::start("epsilon_convert_wc_communicate", "Communicate"); + { + std::map>, + RI::Tensor>>> + Wc_libri; + Profiler::start("epsilon_convert_wc_communicate_1"); + for (const auto &M_NWc : Wc_MNmap) + { + const auto &M = M_NWc.first; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + for (const auto &N_Wc : M_NWc.second) + { + const auto &N = N_Wc.first; + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + const auto &Wc = N_Wc.second; + // std::valarray> 
Wc_va(Wc.ptr(), Wc.size()); + // auto pWc = std::make_shared>>(); + // *pWc = Wc_va; + /*if (iq == 10 && ifreq == 10) + { + char fn[100]; + sprintf(fn, "Wc_M_%zu_N_%zu.dat", M, N); + print_matrix_mm_file(Wc, Params::output_dir + "/" + fn); + }*/ + Wc_libri[M][{N, qa}] = RI::Tensor>({n_mu, n_nu}, Wc.sptr()); + } + } + Profiler::stop("epsilon_convert_wc_communicate_1"); + Profiler::start("epsilon_convert_wc_communicate_2"); + // main timing + // cout << Wc_libri; + const auto IJq_Wc = RI::Communicate_Tensors_Map_Judge::comm_map2_first( + mpi_comm_global_h.comm, Wc_libri, Iset_Jset_Wc.first, Iset_Jset_Wc.second); + Profiler::stop("epsilon_convert_wc_communicate_2"); + Profiler::start("epsilon_convert_wc_communicate_3"); + // parse collected to + for (const auto &MN : atpair_local) + { + const auto &M = MN.first; + const auto &N = MN.second; + const auto n_mu = LIBRPA::atomic_basis_abf.get_atom_nb(M); + const auto n_nu = LIBRPA::atomic_basis_abf.get_atom_nb(N); + // Use row major for later usage in LibRI + Wc_freq_q[freq][M][N][q] = matrix_m>( + n_mu, n_nu, IJq_Wc.at(M).at({N, qa}).data, MAJOR::ROW); + } + Profiler::stop("epsilon_convert_wc_communicate_3"); + // for ( int i_mu = 0; i_mu != n_mu; i_mu++ ) + // for ( int i_nu = 0; i_nu != n_nu; i_nu++ ) + // { + // } + } + Profiler::stop("epsilon_convert_wc_communicate"); + Profiler::stop("epsilon_convert_wc_2d_to_ij"); + Profiler::cease("epsilon_wc_work_q_omega"); + LIBRPA::utils::lib_printf_root( + "Time for Wc(i_q=%d, i_omega=%d) (seconds, Wall/CPU): %f %f\n", iq + 1, ifreq + 1, + Profiler::get_wall_time_last("epsilon_wc_work_q_omega"), + Profiler::get_cpu_time_last("epsilon_wc_work_q_omega")); + } + } +#else + throw std::logic_error("need compilation with LibRI"); +#endif + Profiler::cease("compute_Wc_freq_q_work"); + LIBRPA::utils::lib_printf_root("Time for Wc computation (seconds, Wall/CPU): %f %f\n", + Profiler::get_wall_time_last("compute_Wc_freq_q_work"), + Profiler::get_cpu_time_last("compute_Wc_freq_q_work")); 
+ + return Wc_freq_q; +} + +#endif \ No newline at end of file diff --git a/src/epsilon_cuda.h b/src/epsilon_cuda.h new file mode 100644 index 00000000..0fd7b2d2 --- /dev/null +++ b/src/epsilon_cuda.h @@ -0,0 +1,18 @@ +#pragma once +#include "epsilon.h" +#include "device_connector.h" +#include "matrix_device.h" + +#ifdef LIBRPA_USE_CUDA +CorrEnergy compute_RPA_correlation_cuda(const Chi0 &chi0, const atpair_k_cplx_mat_t &coulmat); +#endif +#ifdef ENABLE_NVHPC +CorrEnergy compute_RPA_correlation_blacs_2d_cuda(Chi0 &chi0, atpair_k_cplx_mat_t &coulmat); + +map, matrix_m>>>::pair_t_old> +compute_Wc_freq_q_blacs_cuda(Chi0 &, const atpair_k_cplx_mat_t &, + atpair_k_cplx_mat_t &, + const vector> &); +complex compute_pi_det_blacs_2d_nvhpc( + MatrixDevice> &, const LIBRPA::Array_Desc &arrdesc_pi, int64_t *d_ipiv, int *d_info,char order='C'); +#endif diff --git a/src/matrix_device.cpp b/src/matrix_device.cpp new file mode 100644 index 00000000..28b7987e --- /dev/null +++ b/src/matrix_device.cpp @@ -0,0 +1,16 @@ +#include "matrix_device.h" +#include "device_stream.h" + +template +void MatrixDevice::set_data_device(const int& m, const int& n, const void* d_A, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeer(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyPeerAsync(this->d_data, device_stream.local_device, d_A, device_stream.local_device, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } +} \ No newline at end of file diff --git a/src/matrix_device.h b/src/matrix_device.h new file mode 100644 index 00000000..000c3847 --- /dev/null +++ b/src/matrix_device.h @@ -0,0 +1,111 @@ +#ifndef MATRIX_DEVICE_H +#define MATRIX_DEVICE_H + +#include +#ifdef ENABLE_NVHPC +#include +#include "helpers.h" +#endif + +template +class MatrixDevice{ +private: + T* d_data=nullptr; + int m=0; + int n=0; +public: + 
MatrixDevice(){} + MatrixDevice(const int& m, const int& n, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + MatrixDevice(const int& m, const int& n,const void* c_data, const void* stream){ + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data(const int& m, const int& n, const void* stream){ + if(m!=this->m || n!=this->n){ + clean(stream); + this->m=m; + this->n=n; + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMalloc((void**)&d_data, m * n * sizeof(T))); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMallocAsync((void**)&d_data, m * n * sizeof(T), (cudaStream_t)stream)); + #endif + } + } + } + void set_data(const int& m, const int& n,const void* c_data, const void* stream){ + set_data(m,n,stream); + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpy(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice)); + #endif + }else{ + #ifdef ENABLE_NVHPC + CUDA_CHECK(cudaMemcpyAsync(d_data, c_data, m * n * sizeof(T), cudaMemcpyHostToDevice, (cudaStream_t)stream)); + #endif + } + } + void set_data_device(const int& m, const int& n, const void* d_A, const void* stream); + + + void clean(const void* stream){ + if(d_data!=nullptr){ + if(stream==nullptr){ + #ifdef ENABLE_NVHPC + cudaFree(d_data); + #endif + }else{ + #ifdef ENABLE_NVHPC + 
cudaFreeAsync(d_data, (cudaStream_t)stream); + #endif + } + d_data=nullptr; + } + this->m=0; + this->n=0; + } + T* ptr(){ + return d_data; + } + const T* ptr() const { + return d_data; + } + int nr() const { + return m; + } + int nc() const { + return n; + } + ~MatrixDevice(){ + clean(nullptr); + } + +}; + +#endif // MATRIX_DEVICE_H \ No newline at end of file From 154ab9a882e053ed45f17276579e636c350b65af Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:55:53 +0800 Subject: [PATCH 09/18] Add files via upload --- src/helpers.h | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 src/helpers.h diff --git a/src/helpers.h b/src/helpers.h new file mode 100644 index 00000000..ec014586 --- /dev/null +++ b/src/helpers.h @@ -0,0 +1,107 @@ + +#ifndef HELPERS_H +#define HELPERS_H + +#pragma once +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include +#endif + + +typedef enum{ + LIBRPA_COMPUTE_TYPE_COMPLEX_DOUBLE, + LIBRPA_COMPUTE_TYPE_COMPLEX_FLOAT, + LIBRPA_COMPUTE_TYPE_DOUBLE, + LIBRPA_COMPUTE_TYPE_FLOAT +}LIBRPA_DEVICE_COMPUTE_TYPE; + + +#ifdef ENABLE_NVHPC +#define NVHPC_MPI_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != MPI_SUCCESS) \ + { \ + fprintf(stderr, "MPI error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define NVSHMEM_CHECK(call) \ + do \ + { \ + int status = call; \ + if (status != 0) \ + { \ + fprintf(stderr, "NVSHMEM error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + + +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t status = call; \ + if (status != cudaSuccess) \ + { \ + fprintf(stderr, "CUDA error at %s:%d : %s\n", __FILE__, __LINE__, cudaGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUBLASMP_CHECK(call) \ + do \ + { \ + cublasStatus_t status = call; \ + if (status 
!= CUBLAS_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuBLASMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + std::printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#define CAL_CHECK(call) \ + do \ + { \ + calError_t status = call; \ + if (status != CAL_OK) \ + { \ + fprintf(stderr, "CAL error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + }while(0) + +#define CUSOLVERMP_CHECK(call) \ + do \ + { \ + cusolverStatus_t status = call; \ + if (status != CUSOLVER_STATUS_SUCCESS) \ + { \ + fprintf(stderr, "cuSOLVERMp error at %s:%d : %d\n", __FILE__, __LINE__, status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) +#endif +#define ORDER_CHECK(order) \ + do \ + { \ + if(order !='C'&&order !='c'&&order !='R'&&order !='r') \ + { \ + fprintf(stderr, "Order should be either 'C' or 'R', order error at %s:%d:%s\n", __FILE__, __LINE__, order);\ + exit(EXIT_FAILURE); \ + } \ + }while (0) + +#endif // HELPERS_H \ No newline at end of file From 91026d70fd5e955d4b810feb2a1bb2f156b5478a Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 16:56:33 +0800 Subject: [PATCH 10/18] Add files via upload --- driver/main.cpp | 12 +++-- driver/task_gw_band.cpp | 115 ++++++++++++---------------------------- 2 files changed, 43 insertions(+), 84 deletions(-) diff --git a/driver/main.cpp b/driver/main.cpp index 736a1838..164671ed 100644 --- a/driver/main.cpp +++ b/driver/main.cpp @@ -39,7 +39,9 @@ #include "utils_cmake.h" #include "utils_mem.h" #include "utils_mpi_io.h" - +#ifdef ENABLE_NVHPC +#include "device_stream.h" +#endif static void initialize(int argc, char **argv) { using namespace LIBRPA::envs; @@ -54,7 +56,9 @@ static void initialize(int argc, char **argv) } 
initialize_librpa_environment(MPI_COMM_WORLD, 0, 0, ""); - + #ifdef ENABLE_NVHPC + device_stream.init(); + #endif // Global profiler begins right after MPI is initialized Profiler::start("total", "Total"); } @@ -77,7 +81,9 @@ static void finalize(bool success) lib_printf("libRPA failed\n"); } } - + #ifdef ENABLE_NVHPC + device_stream.finalize(); + #endif finalize_librpa_environment(); MPI_Finalize(); diff --git a/driver/task_gw_band.cpp b/driver/task_gw_band.cpp index fd420614..1a9d99b3 100644 --- a/driver/task_gw_band.cpp +++ b/driver/task_gw_band.cpp @@ -23,6 +23,10 @@ #include "ri.h" #include "utils_timefreq.h" #include "write_aims.h" +#ifdef ENABLE_NVHPC +#include "epsilon_cuda.h" +#include +#endif void task_g0w0_band(std::map, ComplexMatrix> &sinvS) { @@ -114,19 +118,9 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Profiler::start("g0w0_exx", "Build exchange self-energy"); auto exx = LIBRPA::Exx(meanfield, kfrac_list, period); { - atpair_R_mat_t VR; - if (Params::use_fullcoul_exx) - { - Profiler::start("ft_vq_full", "Fourier transform full Coulomb"); - VR = FT_Vq(Vq, meanfield.get_n_kpoints(), Rlist, true); - Profiler::stop("ft_vq_full"); - } - else - { - Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); - VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); - Profiler::stop("ft_vq_cut"); - } + Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); + const auto VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); + Profiler::stop("ft_vq_cut"); Profiler::start("g0w0_exx_real_work"); if (Params::use_shrink_abfs) @@ -156,7 +150,15 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Wc_freq_q; if (Params::use_scalapack_gw_wc) { + #ifdef ENABLE_NVHPC + int numDevices; + cudaError_t cudaStat = cudaGetDeviceCount(&numDevices); + if(cudaStat == cudaSuccess && numDevices>0) + Wc_freq_q = compute_Wc_freq_q_blacs_cuda(chi0, Vq, Vq_cut, epsmac_LF_imagfreq); + else + #endif Wc_freq_q = 
compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, epsmac_LF_imagfreq); + } else { @@ -442,21 +444,11 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) } } // output bandgap - double gw_bandgap = 0.0; - double gw_valence = -1.e10; - double gw_conduct = 1.e10; - double exx_bandgap = 0.0; - double exx_valence = -1.e10; - double exx_conduct = 1.e10; - double dft_bandgap = 0.0; - double dft_valence = -1.e10; - double dft_conduct = 1.e10; - int ik_val_gw = 0; - int ik_cond_gw = 0; - int ik_val_exx = 0; - int ik_cond_exx = 0; - int ik_val_dft = 0; - int ik_cond_dft = 0; + double bandgap = 0.0; + double valence = -1.e10; + double conduct = 1.e10; + int ik_val = 0; + int ik_cond = 0; int nocc = 0; auto &wg = meanfield.get_weight()[0]; for (int i = 0; i != wg.size; i++) @@ -522,38 +514,16 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) ofs_hf << std::setw(15) << std::setprecision(5) << occ_state << std::setw(15) << std::setprecision(5) << eks_state - vxc_state + exx_state; - // output GW bandgap - if (i_state == nocc - 1 && eqp > gw_conduct) // HOMO - { - gw_valence = eqp; - ik_val_gw = i_kpoint; - } - else if (i_state == nocc && eqp < gw_conduct) // LUMO - { - gw_conduct = eqp; - ik_cond_gw = i_kpoint; - } - // output EXX bandgap - if (i_state == nocc - 1 && exx_state > exx_valence) // HOMO - { - exx_valence = exx_state; - ik_val_exx = i_kpoint; - } - else if (i_state == nocc && exx_state < exx_conduct) // LUMO - { - exx_conduct = exx_state; - ik_cond_exx = i_kpoint; - } - // output DFT bandgap - if (i_state == nocc - 1 && eks_state > dft_valence) // HOMO + // output bandgap + if (i_state == nocc - 1 && eqp > valence) // HOMO { - dft_valence = eks_state; - ik_val_dft = i_kpoint; + valence = eqp; + ik_val = i_kpoint; } - else if (i_state == nocc && eks_state < dft_conduct) // LUMO + else if (i_state == nocc && eqp < conduct) // LUMO { - dft_conduct = eks_state; - ik_cond_dft = i_kpoint; + conduct = eqp; + ik_cond = i_kpoint; } } ofs_gw << "\n"; @@ -561,30 +531,13 @@ 
void task_g0w0_band(std::map, ComplexMatrix> &sinvS) ofs_ks << "\n"; } } - gw_bandgap = gw_conduct - gw_valence; - exx_bandgap = exx_conduct - exx_valence; - dft_bandgap = dft_conduct - dft_valence; - const auto &k_val_gw = kfrac_band[ik_val_gw]; - const auto &k_cond_gw = kfrac_band[ik_cond_gw]; - printf("GW VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_gw + 1, k_val_gw.x, k_val_gw.y, - k_val_gw.z); - printf("GW CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_gw + 1, k_cond_gw.x, - k_cond_gw.y, k_cond_gw.z); - lib_printf("GW bandgap(eV): %12.7f \n", gw_bandgap); - const auto &k_val_exx = kfrac_band[ik_val_exx]; - const auto &k_cond_exx = kfrac_band[ik_cond_exx]; - printf("EXX VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_exx + 1, k_val_exx.x, - k_val_exx.y, k_val_exx.z); - printf("EXX CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_exx + 1, k_cond_exx.x, - k_cond_exx.y, k_cond_exx.z); - lib_printf("EXX bandgap(eV): %12.7f \n", exx_bandgap); - const auto &k_val_dft = kfrac_band[ik_val_dft]; - const auto &k_cond_dft = kfrac_band[ik_cond_dft]; - printf("DFT VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_dft + 1, k_val_dft.x, - k_val_dft.y, k_val_dft.z); - printf("DFT CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_dft + 1, k_cond_dft.x, - k_cond_dft.y, k_cond_dft.z); - lib_printf("DFT bandgap(eV): %12.7f \n", dft_bandgap); + bandgap = conduct - valence; + const auto &k_val = kfrac_band[ik_val]; + printf("VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val + 1, k_val.x, k_val.y, k_val.z); + const auto &k_cond = kfrac_band[ik_cond]; + printf("CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond + 1, k_cond.x, k_cond.y, + k_cond.z); + lib_printf("Bandgap(eV): %12.7f \n", bandgap); } Profiler::stop("g0w0_solve_band_qpe"); From c91e40b956c9a98490931739a59fe289e44c133c Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:00:45 +0800 Subject: [PATCH 11/18] Add files via upload --- src/dielecmodel.cpp | 355 
+++++++++++++++++++++++++++++++++++++++++--- src/dielecmodel.h | 17 ++- 2 files changed, 352 insertions(+), 20 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index 87507dcd..cff7158f 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -19,7 +19,9 @@ using RI::Communicate_Tensors_Map_Judge::comm_map2_first; using LIBRPA::envs::mpi_comm_global_h; using LIBRPA::envs::ofs_myid; using LIBRPA::utils::lib_printf; - +#ifdef ENABLE_NVHPC +#include "cuda_connector.h" +#endif const int DoubleHavriliakNegami::d_npar = 8; const std::function &)> @@ -1070,6 +1072,43 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, return desc_body; }; +Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) +{ + Profiler::start("get_inverse_body_of_chi0_nvhpc"); + gpu_dev_stream.calSync(); + Array_Desc desc_body(blacs_ctxt_global_h); + desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + this->d_body_inv.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + Array_Desc_Device desc_body_dev(desc_body); + CudaConnector::pgemr2d_nvhpc( + gpu_dev_stream, n_nonsingular - 1, n_nonsingular - 1, + d_chi0_block.ptr(), 2, 2, desc_nabf_nabf_opt, + this->d_body_inv.ptr(), 1, 1, desc_body, + CUDA_C_64F + ); + ComplexMatrixDevice d_identity; + d_identity.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); + d_identity.set_as_identity(gpu_dev_stream, desc_body_dev); + char order = 'c'; + if(order == 'c'||order == 'C'){ + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_body_inv); + CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + CudaConnector::pgetrf_trs_nvhpc_mixed_precision( + gpu_dev_stream, CUBLAS_OP_N, + d_body_inv.ptr(), 1, 1, desc_body, + d_identity.ptr(), 1, 1, desc_body, + CUDA_C_64F, order + ); + if(order == 'c'||order == 'C'){ + 
CudaConnector::transpose_ComplexMatrixDevice(gpu_dev_stream,d_identity); + } + + this->d_body_inv.set_data_device(d_identity.nr(),d_identity.nc(),d_identity.ptr(),gpu_dev_stream); + Profiler::stop("get_inverse_body_of_chi0_nvhpc"); + return desc_body; +} + void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) { Profiler::start("cal_L"); @@ -1136,6 +1175,170 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) Profiler::stop("cal_L"); }; +#ifdef ENABLE_NVHPC +void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body) +{ + Profiler::start("cal_L"); + this->Lind.resize(3, 3, MAJOR::COL); + this->bw.resize(n_nonsingular - 1, 3, MAJOR::COL); + this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); + Array_Desc desc_wing(blacs_ctxt_global_h); + desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); + + Array_Desc desc_wing_opt(blacs_ctxt_global_h); + desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + + + Array_Desc desc_lam_3(blacs_ctxt_global_h); + desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); + + Array_Desc desc_lam_3_opt(blacs_ctxt_global_h); + desc_lam_3_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_lam_3.nb(), 0, 0); + + Array_Desc desc_3_lam(blacs_ctxt_global_h); + desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); + + Array_Desc desc_3_lam_opt(blacs_ctxt_global_h); + desc_3_lam_opt.init(3, n_nonsingular - 1, desc_3_lam.mb(), desc_body.nb(), 0, 0); + + Array_Desc desc_3_3(blacs_ctxt_global_h); + desc_3_3.init_square_blk(3, 3, 0, 0); + + auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); + auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); + auto Lind_loc = init_local_mat>(desc_3_3, MAJOR::COL); + // printf("rank:%d,ma:%d,na:%d,mb:%d,nb:%d,mc:%d,nc:%d\n",gpu_dev_stream.rank,desc_body.m(),desc_body.n(),desc_wing_opt.m(),desc_wing_opt.n(),desc_lam_3_opt.m(),desc_lam_3_opt.n()); + // 
printf("rank:%d,m_loc_a:%d,n_loc_a:%d,m_loc_b:%d,n_loc_b:%d,m_loc_c:%d,n_loc_c:%d\n",gpu_dev_stream.rank,desc_body.m_loc(),desc_body.n_loc(),desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc()); + // printf("rank:%d,mba:%d,nba:%d,mbb:%d,nbb:%d,mbc:%d,nbc:%d\n",gpu_dev_stream.rank,desc_body.mb(),desc_body.nb(),desc_wing_opt.mb(),desc_wing_opt.nb(),desc_lam_3_opt.mb(),desc_lam_3_opt.nb()); + // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); + #ifdef ENABLE_NVHPC + auto wing_ifreq_opt = init_local_mat>(desc_wing_opt, MAJOR::COL); + auto lam_3_opt = init_local_mat>(desc_lam_3_opt, MAJOR::COL); + auto _3_lam_opt = init_local_mat>(desc_3_lam_opt, MAJOR::COL); + ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc; + ComplexMatrixDevice d_lam_3_opt,d_3_lam_opt; + ComplexMatrixDevice d_wing_ifreq,d_wing_ifreq_opt; + d_wing_ifreq.set_data(wing.at(ifreq).nr(),wing.at(ifreq).nc(),wing.at(ifreq).ptr(),gpu_dev_stream.stream); + d_wing_ifreq_opt.set_data(desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),gpu_dev_stream.stream); + // d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); + d_lam_3_opt.set_data(desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc(),gpu_dev_stream.stream); + // d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); + d_3_lam_opt.set_data(desc_3_lam_opt.m_loc(),desc_3_lam_opt.n_loc(),gpu_dev_stream.stream); + d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); + ScalapackConnector::pgemr2d_f( + n_nonsingular-1, 3, + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, + wing_ifreq_opt.ptr(), 1, 1, desc_wing_opt.desc, + blacs_ctxt_global_h.ictxt + ); + // CudaConnector::pgemr2d_nvhpc( + // gpu_dev_stream, n_nonsingular-1, 3, + // d_wing_ifreq.ptr(), 1, 1, desc_wing, + // d_wing_ifreq_opt.ptr(), 1, 1, desc_wing, + // CUDA_C_64F + // ); + 
CUDA_CHECK(cudaMemcpyAsync(d_wing_ifreq_opt.ptr(),wing_ifreq_opt.ptr(),desc_wing_opt.m_loc()*desc_wing_opt.n_loc()*sizeof(std::complex),cudaMemcpyHostToDevice,gpu_dev_stream.stream)); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, + &calpha, + d_body_inv, 1, 1, desc_body, + d_wing_ifreq_opt, 1, 1, desc_wing_opt, + &cbeta, + d_lam_3_opt, 1, 1, desc_lam_3_opt, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + CUDA_CHECK(cudaMemcpyAsync(lam_3_opt.ptr(),d_lam_3_opt.ptr(),desc_lam_3_opt.m_loc()*desc_lam_3_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + ScalapackConnector::pgemr2d_f( + n_nonsingular-1, 3, + lam_3_opt.ptr(), 1, 1, desc_lam_3_opt.desc, + lam_3.ptr(), 1, 1, desc_lam_3.desc, + blacs_ctxt_global_h.ictxt + ); + gpu_dev_stream.cudaSync(); + #else + ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, + desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); + #endif + #ifdef ENABLE_NVHPC + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, 3, n_nonsingular - 1, + &calpha, + d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_lam_3_opt, 1, 1, desc_lam_3_opt, + &cbeta, + d_Lind_loc, 1, 1, desc_3_3, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(),d_Lind_loc.ptr(),desc_3_3.m_loc()*desc_3_3.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + #else + ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, + desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, + Lind_loc.ptr(), 1, 1, desc_3_3.desc); + #endif + #ifndef ENABLE_NVHPC + CudaConnector::pgemm_nvhpc( + gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, + &calpha, + 
d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_body_inv, 1, 1, desc_body, + &cbeta, + d_3_lam_opt, 1, 1, desc_3_lam_opt, + CUBLAS_COMPUTE_64F_PEDANTIC + ); + CUDA_CHECK(cudaMemcpyAsync(_3_lam_opt.ptr(),d_3_lam_opt.ptr(),desc_3_lam_opt.m_loc()*desc_3_lam_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + gpu_dev_stream.cudaSync(); + ScalapackConnector::pgemr2d_f( + 3, n_nonsingular-1, + _3_lam_opt.ptr(), 1, 1, desc_3_lam_opt.desc, + _3_lam.ptr(), 1, 1, desc_3_lam.desc, + blacs_ctxt_global_h.ictxt + ); + #else + ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, + desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); + #endif + + for (int i = 0; i != 3; i++) + { + auto loc_i = desc_3_3.indx_g2l_r(i); + for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) + { + auto loc_ilambda = desc_lam_3.indx_g2l_r(ilambda); + auto loc_ibw = desc_lam_3.indx_g2l_c(i); + if (loc_ibw >= 0 && loc_ilambda >= 0) + this->bw(ilambda, i) = lam_3(loc_ilambda, loc_ibw); + + loc_ilambda = desc_3_lam.indx_g2l_c(ilambda); + auto loc_iwb = desc_3_lam.indx_g2l_r(i); + if (loc_iwb >= 0 && loc_ilambda >= 0) + this->wb(i, ilambda) = _3_lam(loc_iwb, loc_ilambda); + + MPI_Allreduce(MPI_IN_PLACE, &bw(ilambda, i), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + MPI_Allreduce(MPI_IN_PLACE, &wb(i, ilambda), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + + for (int j = 0; j != 3; j++) + { + auto loc_j = desc_3_3.indx_g2l_c(j); + if (loc_j >= 0 && loc_i >= 0) + this->Lind(i, j) = head.at(ifreq)(i, j) - Lind_loc(loc_i, loc_j); + MPI_Allreduce(MPI_IN_PLACE, &Lind(i, j), 1, MPI_DOUBLE_COMPLEX, MPI_SUM, + mpi_comm_global_h.comm); + } + } + + Profiler::stop("cal_L"); +}; +#endif + void diele_func::get_Leb_points() { auto quad_order = lebedev::QuadratureOrder::order_5810; @@ -1296,24 +1499,6 @@ void diele_func::cal_eps(const int ifreq, 
Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - // auto identity = init_local_mat>(desc_body, MAJOR::COL); - // for (int i = 0; i < n_nonsingular - 1; i++) - // { - // const int ilo = desc_body.indx_g2l_r(i); - // if (ilo < 0) continue; - // for (int j = 0; j < n_nonsingular - 1; j++) - // { - // const int jlo = desc_body.indx_g2l_c(j); - // if (jlo < 0) continue; - // if (i == j) - // identity(ilo, jlo) = 1.0; - // else - // identity(ilo, jlo) = 0.0; - // } - // } - // ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, n_nonsingular - 1, n_nonsingular - 1, - // 1.0, body_inv.ptr(), 1, 1, desc_body.desc, identity.ptr(), 1, 1, - // desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); ScalapackConnector::pgeadd_f( 'N', n_nonsingular - 1, n_nonsingular - 1, 1.0, @@ -1327,6 +1512,119 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ Profiler::stop("cal_inverse_dielectric_matrix"); }; +#ifdef ENABLE_NVHPC +void diele_func::cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body) +{ + Profiler::start("cal_inverse_dielectric_matrix"); + // mpi_comm_global_h.barrier(); + this->chi0 = init_local_mat>(desc_nabf_nabf_opt, MAJOR::COL); + + const double k_volume = std::abs(G.Det()); + this->vol_gamma = k_volume / nk; + double vol_gamma_numeric = 0.0; + if (ifreq == 0 && mpi_comm_global_h.is_root()) + { + for (int ileb = 0; ileb != qw_leb.size(); ileb++) + { + vol_gamma_numeric += qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / 3.0; + } + std::cout << "Number of angular grids for average inverse dielectric matrix: " + << qw_leb.size() << std::endl; + std::cout << "vol_gamma_numeric/vol_gamma: " << vol_gamma_numeric << ", " << vol_gamma + << std::endl; + std::cout << "Angular quadrature accuracy for volume: " << vol_gamma_numeric / vol_gamma + << " (should be close to 1)" << std::endl; + } + /*std::cout << "major of Matz: " << 
wing[0].is_row_major() << "," << body_inv.is_row_major() + << "," << transpose(wing.at(0), true).is_row_major() << "," << Lind.is_row_major() + << std::endl;*/ + construct_L_nvhpc(gpu_dev_stream, ifreq, desc_body); + + Profiler::start("precompute_q_data"); + + const size_t nleb = qw_leb.size(); + std::vector> weights(nleb); + std::vector> q_vectors(nleb); + + const auto L00 = Lind(0, 0), L01 = Lind(0, 1), L02 = Lind(0, 2); + const auto L10 = Lind(1, 0), L11 = Lind(1, 1), L12 = Lind(1, 2); + const auto L20 = Lind(2, 0), L21 = Lind(2, 1), L22 = Lind(2, 2); + +#pragma omp parallel for schedule(static) + for (int ileb = 0; ileb < nleb; ++ileb) + { + const double qx = qx_leb[ileb]; + const double qy = qy_leb[ileb]; + const double qz = qz_leb[ileb]; + + q_vectors[ileb] = {qx, qy, qz}; + + const auto qLq = qx * (qx * L00 + qy * L01 + qz * L02) + + qy * (qx * L10 + qy * L11 + qz * L12) + + qz * (qx * L20 + qy * L21 + qz * L22); + + weights[ileb] = qw_leb[ileb] * std::pow(q_gamma[ileb], 3) / (3.0 * vol_gamma) / qLq; + } + Profiler::stop("precompute_q_data"); + + Profiler::start("cal_inverse_dielectric_matrix_ij"); + int i_start = 0, i_end = n_nonsingular; + int j_start = 0, j_end = n_nonsingular; +#pragma omp parallel for schedule(dynamic, 4) collapse(2) + for (int i = i_start; i != i_end; i++) + { + for (int j = j_start; j != j_end; j++) + { + const int ilo = desc_nabf_nabf_opt.indx_g2l_r(i); + if (ilo < 0) continue; + const int jlo = desc_nabf_nabf_opt.indx_g2l_c(j); + if (jlo < 0) continue; + + complex result = 0.0; + + if (i == 0 && j == 0) + { + for (int ileb = 0; ileb < nleb; ++ileb) + { + result += weights[ileb]; + } + } + else if (i == 0 || j == 0) + { + result = 0.0; + } + else + { + const int idx_i = i - 1, idx_j = j - 1; + + const auto bw_i0 = bw(idx_i, 0), bw_i1 = bw(idx_i, 1), bw_i2 = bw(idx_i, 2); + const auto wb_j0 = wb(0, idx_j), wb_j1 = wb(1, idx_j), wb_j2 = wb(2, idx_j); + + for (int ileb = 0; ileb < nleb; ++ileb) + { + const auto &[qx, qy, qz] = 
q_vectors[ileb]; + const auto bwq = bw_i0 * qx + bw_i1 * qy + bw_i2 * qz; + const auto qwb = qx * wb_j0 + qy * wb_j1 + qz * wb_j2; + + result += weights[ileb] * bwq * qwb; + } + } + chi0(ilo, jlo) = result; + } + } + ScalapackConnector::pgeadd_f( + 'N', n_nonsingular - 1, n_nonsingular - 1, + 1.0, + body_inv.ptr(), 1, 1, desc_body.desc, + 1.0, + chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + Profiler::stop("cal_inverse_dielectric_matrix_ij"); + if (mpi_comm_global_h.is_root()) + std::cout << "* Success: calculate average inverse dielectric matrix no." << ifreq + 1 + << "." << std::endl; + Profiler::stop("cal_inverse_dielectric_matrix"); +}; +#endif /*std::complex diele_func::compute_chi0_inv_00(const int ifreq) { std::complex total = 0.0; @@ -1416,4 +1714,23 @@ void diele_func::rewrite_eps(matrix_m> &chi0_block, const i this->body_inv.clear(); }; +#ifdef ENABLE_NVHPC +void diele_func::rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const int& ifreq, Array_Desc& desc_nabf_nabf_opt) +{ + auto desc_body = get_body_inv_nvhpc(gpu_dev_stream, d_chi0_block, desc_nabf_nabf_opt); + this->body_inv = init_local_mat>(desc_body, MAJOR::COL); + CUDA_CHECK(cudaMemcpyAsync(this->body_inv.ptr(),this->d_body_inv.ptr(),sizeof(cuDoubleComplex)*desc_body.m_loc()*desc_body.n_loc(),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + + cal_eps_nvhpc(gpu_dev_stream, ifreq, desc_nabf_nabf_opt, desc_body); + d_chi0_block.set_data(this->chi0.nr(),this->chi0.nc(),this->chi0.ptr(),gpu_dev_stream.stream); + // this->chi0.clear(); + ofs_myid << get_timestamp() << "success finish rewrite_eps"<Lind.clear(); + this->body_inv.clear(); + this->d_body_inv.clean(gpu_dev_stream.stream); + ofs_myid << get_timestamp() << "success clean the data"<> body_inv; // ( i:3, j:3 ) + #ifdef ENABLE_NVHPC + ComplexMatrixDevice d_body_inv; + #endif matrix_m> Lind; // ( i:n_lambda, j:3 ) matrix_m> bw; @@ -132,16 +137,26 @@ class diele_func Array_Desc get_body_inv(matrix_m> 
&chi0_block, Array_Desc &desc_nabf_nabf_opt); void construct_L(const int ifreq, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc& desc_body); + #endif // Lebedev-Laikov quadrature void get_Leb_points(); void get_g_enclosing_gamma(); void calculate_q_gamma(); void cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #ifdef ENABLE_NVHPC + void cal_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream, const int& ifreq, Array_Desc &desc_nabf_nabf_opt, Array_Desc &desc_body); + #endif // not used now due to performance optimization // std::complex compute_chi0_inv_00(const int ifreq); // std::complex compute_chi0_inv_ij(const int ifreq, int i, int j); void rewrite_eps(matrix_m> &chi0_block, const int ifreq, Array_Desc &desc_nabf_nabf_opt); + #ifdef ENABLE_NVHPC + Array_Desc get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice&, const Array_Desc&); + void rewrite_eps_nvhpc(const GpuDeviceStream& gpu_dev_stream,ComplexMatrixDevice&,const int& ifreq,Array_Desc& desc_nabf_nabf_opt); + #endif void assign_chi0(matrix_m> &chi0_block, Array_Desc &desc_nabf_nabf_opt); }; From 8f430d7282e5bcddc878b55cbad9b58f7a19f679 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:13:30 +0800 Subject: [PATCH 12/18] Add files via upload --- src/gpu_device_stream.cpp | 0 src/gpu_device_stream.h | 121 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 src/gpu_device_stream.cpp create mode 100644 src/gpu_device_stream.h diff --git a/src/gpu_device_stream.cpp b/src/gpu_device_stream.cpp new file mode 100644 index 00000000..e69de29b diff --git a/src/gpu_device_stream.h b/src/gpu_device_stream.h new file mode 100644 index 00000000..849556bd --- /dev/null +++ b/src/gpu_device_stream.h @@ -0,0 +1,121 @@ +#pragma once +//=================hbchen 
2025-05-11========================= +#include +#include +#include +#include +//=================hbchen 2025-05-11========================= +#include +#include +#include +#include +#ifdef ENABLE_NVHPC +#include +#include +#include "helpers.h" +#endif + +class GpuDeviceStream{ +private: + cal_comm_create_params_t params; + static inline calError_t allgather(void* src_buf, void* recv_buf, size_t size, void* data, void** request) + { + MPI_Request req; + int err = MPI_Iallgather(src_buf, size, MPI_BYTE, recv_buf, size, MPI_BYTE, (MPI_Comm)(data), &req); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + *request = (void*)(req); + return CAL_OK; + } + + static inline calError_t request_test(void* request) + { + MPI_Request req = (MPI_Request)(request); + int completed; + int err = MPI_Test(&req, &completed, MPI_STATUS_IGNORE); + if (err != MPI_SUCCESS) + { + return CAL_ERROR; + } + return completed ? CAL_OK : CAL_ERROR_INPROGRESS; + } + + static inline calError_t request_free(void* request) + { + return CAL_OK; + } + static inline int getLocalDevice() + { + int localRank; + MPI_Comm localComm; + + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &localComm); + MPI_Comm_rank(localComm, &localRank); + MPI_Comm_free(&localComm); + + int deviceCount = 0; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + + return localRank % deviceCount; + } + + +public: + int rank; + int nranks; + int local_device; + cudaStream_t stream = nullptr; + cal_comm_t cal_comm = nullptr; + + cusolverMpHandle_t cusolver_handle = nullptr; + cublasMpHandle_t cublas_handle = nullptr; + GpuDeviceStream(){ + MPI_Comm_size(MPI_COMM_WORLD, &nranks); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + local_device = GpuDeviceStream::getLocalDevice(); + // printf("myrank:%d, local_device:%d\n", rank, local_device); + CUDA_CHECK(cudaSetDevice(local_device)); + CUDA_CHECK(cudaFree(nullptr)); + { + params.allgather = GpuDeviceStream::allgather; + params.req_test = 
GpuDeviceStream::request_test; + params.req_free = GpuDeviceStream::request_free; + params.data = (void*)(MPI_COMM_WORLD); + params.rank = rank; + params.nranks = nranks; + params.local_device = local_device; + + CAL_CHECK(cal_comm_create(params, &cal_comm)); + } + CUDA_CHECK(cudaStreamCreate(&stream)); + CUSOLVERMP_CHECK(cusolverMpCreate(&cusolver_handle, local_device, stream)); + CUBLASMP_CHECK(cublasMpCreate(&cublas_handle, stream)); + + } + ~GpuDeviceStream(){ + if(stream!=nullptr){ + CUDA_CHECK(cudaStreamDestroy(stream)); + stream=nullptr; + } + if(cal_comm!=nullptr){ + CAL_CHECK(cal_comm_destroy(cal_comm)); + cal_comm=nullptr; + } + if(cusolver_handle!=nullptr){ + CUSOLVERMP_CHECK(cusolverMpDestroy(cusolver_handle)); + cusolver_handle=nullptr; + } + if(cublas_handle!=nullptr){ + CUBLASMP_CHECK(cublasMpDestroy(cublas_handle)); + cublas_handle=nullptr; + } + } + void cudaSync() const { + CUDA_CHECK(cudaStreamSynchronize(stream)); + } + void calSync() const { + CAL_CHECK(cal_stream_sync(cal_comm,stream)); + } +}; \ No newline at end of file From 44f9f2c0e85fa35899ad39d354adbd1165f5186e Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Fri, 14 Nov 2025 17:58:42 +0800 Subject: [PATCH 13/18] fix error for non-gpu installation From d2ec742920b66e6096eaa054a92da6d91e6cef54 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Mon, 17 Nov 2025 20:34:09 +0800 Subject: [PATCH 14/18] Add files via upload --- src/dielecmodel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index cff7158f..7ac8ee73 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -1071,7 +1071,7 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Profiler::stop("get_inverse_body_of_chi0"); return desc_body; }; - +#ifdef ENABLE_NVHPC Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& 
d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) { Profiler::start("get_inverse_body_of_chi0_nvhpc"); @@ -1108,7 +1108,7 @@ Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, Profiler::stop("get_inverse_body_of_chi0_nvhpc"); return desc_body; } - +#endif void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) { Profiler::start("cal_L"); From 71970ea907d0a80bff47037cde3e0253c153b08d Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Sat, 22 Nov 2025 20:15:34 +0800 Subject: [PATCH 15/18] merge --- driver/task_gw_band.cpp | 105 +++++++++++++++++++++++++++++++--------- 1 file changed, 81 insertions(+), 24 deletions(-) diff --git a/driver/task_gw_band.cpp b/driver/task_gw_band.cpp index 1a9d99b3..a777e9e7 100644 --- a/driver/task_gw_band.cpp +++ b/driver/task_gw_band.cpp @@ -27,7 +27,6 @@ #include "epsilon_cuda.h" #include #endif - void task_g0w0_band(std::map, ComplexMatrix> &sinvS) { using LIBRPA::envs::mpi_comm_global_h; @@ -118,9 +117,19 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) Profiler::start("g0w0_exx", "Build exchange self-energy"); auto exx = LIBRPA::Exx(meanfield, kfrac_list, period); { - Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); - const auto VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); - Profiler::stop("ft_vq_cut"); + atpair_R_mat_t VR; + if (Params::use_fullcoul_exx) + { + Profiler::start("ft_vq_full", "Fourier transform full Coulomb"); + VR = FT_Vq(Vq, meanfield.get_n_kpoints(), Rlist, true); + Profiler::stop("ft_vq_full"); + } + else + { + Profiler::start("ft_vq_cut", "Fourier transform truncated Coulomb"); + VR = FT_Vq(Vq_cut, meanfield.get_n_kpoints(), Rlist, true); + Profiler::stop("ft_vq_cut"); + } Profiler::start("g0w0_exx_real_work"); if (Params::use_shrink_abfs) @@ -158,7 +167,6 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) else #endif Wc_freq_q = compute_Wc_freq_q_blacs(chi0, Vq, Vq_cut, 
epsmac_LF_imagfreq); - } else { @@ -444,11 +452,21 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) } } // output bandgap - double bandgap = 0.0; - double valence = -1.e10; - double conduct = 1.e10; - int ik_val = 0; - int ik_cond = 0; + double gw_bandgap = 0.0; + double gw_valence = -1.e10; + double gw_conduct = 1.e10; + double exx_bandgap = 0.0; + double exx_valence = -1.e10; + double exx_conduct = 1.e10; + double dft_bandgap = 0.0; + double dft_valence = -1.e10; + double dft_conduct = 1.e10; + int ik_val_gw = 0; + int ik_cond_gw = 0; + int ik_val_exx = 0; + int ik_cond_exx = 0; + int ik_val_dft = 0; + int ik_cond_dft = 0; int nocc = 0; auto &wg = meanfield.get_weight()[0]; for (int i = 0; i != wg.size; i++) @@ -514,16 +532,38 @@ void task_g0w0_band(std::map, ComplexMatrix> &sinvS) ofs_hf << std::setw(15) << std::setprecision(5) << occ_state << std::setw(15) << std::setprecision(5) << eks_state - vxc_state + exx_state; - // output bandgap - if (i_state == nocc - 1 && eqp > valence) // HOMO + // output GW bandgap + if (i_state == nocc - 1 && eqp > gw_conduct) // HOMO + { + gw_valence = eqp; + ik_val_gw = i_kpoint; + } + else if (i_state == nocc && eqp < gw_conduct) // LUMO + { + gw_conduct = eqp; + ik_cond_gw = i_kpoint; + } + // output EXX bandgap + if (i_state == nocc - 1 && exx_state > exx_valence) // HOMO + { + exx_valence = exx_state; + ik_val_exx = i_kpoint; + } + else if (i_state == nocc && exx_state < exx_conduct) // LUMO + { + exx_conduct = exx_state; + ik_cond_exx = i_kpoint; + } + // output DFT bandgap + if (i_state == nocc - 1 && eks_state > dft_valence) // HOMO { - valence = eqp; - ik_val = i_kpoint; + dft_valence = eks_state; + ik_val_dft = i_kpoint; } - else if (i_state == nocc && eqp < conduct) // LUMO + else if (i_state == nocc && eks_state < dft_conduct) // LUMO { - conduct = eqp; - ik_cond = i_kpoint; + dft_conduct = eks_state; + ik_cond_dft = i_kpoint; } } ofs_gw << "\n"; @@ -531,13 +571,30 @@ void task_g0w0_band(std::map, ComplexMatrix> 
&sinvS) ofs_ks << "\n"; } } - bandgap = conduct - valence; - const auto &k_val = kfrac_band[ik_val]; - printf("VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val + 1, k_val.x, k_val.y, k_val.z); - const auto &k_cond = kfrac_band[ik_cond]; - printf("CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond + 1, k_cond.x, k_cond.y, - k_cond.z); - lib_printf("Bandgap(eV): %12.7f \n", bandgap); + gw_bandgap = gw_conduct - gw_valence; + exx_bandgap = exx_conduct - exx_valence; + dft_bandgap = dft_conduct - dft_valence; + const auto &k_val_gw = kfrac_band[ik_val_gw]; + const auto &k_cond_gw = kfrac_band[ik_cond_gw]; + printf("GW VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_gw + 1, k_val_gw.x, k_val_gw.y, + k_val_gw.z); + printf("GW CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_gw + 1, k_cond_gw.x, + k_cond_gw.y, k_cond_gw.z); + lib_printf("GW bandgap(eV): %12.7f \n", gw_bandgap); + const auto &k_val_exx = kfrac_band[ik_val_exx]; + const auto &k_cond_exx = kfrac_band[ik_cond_exx]; + printf("EXX VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_exx + 1, k_val_exx.x, + k_val_exx.y, k_val_exx.z); + printf("EXX CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_exx + 1, k_cond_exx.x, + k_cond_exx.y, k_cond_exx.z); + lib_printf("EXX bandgap(eV): %12.7f \n", exx_bandgap); + const auto &k_val_dft = kfrac_band[ik_val_dft]; + const auto &k_cond_dft = kfrac_band[ik_cond_dft]; + printf("DFT VBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_val_dft + 1, k_val_dft.x, + k_val_dft.y, k_val_dft.z); + printf("DFT CBM: k-point %4d: (%.5f, %.5f, %.5f) \n", ik_cond_dft + 1, k_cond_dft.x, + k_cond_dft.y, k_cond_dft.z); + lib_printf("DFT bandgap(eV): %12.7f \n", dft_bandgap); } Profiler::stop("g0w0_solve_band_qpe"); From ac931d4bf5e6fb275eb9eb58156fd12e4796e895 Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Tue, 2 Dec 2025 16:50:28 +0800 Subject: [PATCH 16/18] add gpu head_wing calaulation --- src/dielecmodel.cpp | 154 
++++++++++++++++---------------------------- 1 file changed, 56 insertions(+), 98 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index 7ac8ee73..ee046300 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -19,6 +19,7 @@ using RI::Communicate_Tensors_Map_Judge::comm_map2_first; using LIBRPA::envs::mpi_comm_global_h; using LIBRPA::envs::ofs_myid; using LIBRPA::utils::lib_printf; + #ifdef ENABLE_NVHPC #include "cuda_connector.h" #endif @@ -500,8 +501,16 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac int n_lambda = this->n_nonsingular - 1; Array_Desc desc_wing_mu(blacs_ctxt_global_h); desc_wing_mu.init_square_blk(n_abf, 3, 0, 0); + desc_wing_mu.init(n_abf, 3, desc_nabf_nabf_opt.mb(), desc_wing_mu.nb(), 0, 0); Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); + Array_Desc desc_body(blacs_ctxt_global_h); + desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128, desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); + + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + for (int iomega = 0; iomega != this->omega.size(); iomega++) { auto &wing_tmp = this->wing.at(iomega); @@ -524,7 +533,8 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac // drop the first column of sqrtveig_blacs, the largest eigenvalue ScalapackConnector::pgemm_f('C', 'N', n_lambda, 3, n_abf, 1.0, sqrtveig_blacs.ptr(), 1, 2, desc_nabf_nabf_opt.desc, wing_mu_tmp.ptr(), 1, 1, - desc_wing_mu.desc, 0.0, wing_tmp.ptr(), 1, 1, desc_wing.desc); + desc_wing_mu.desc, 0.0, wing_tmp.ptr(), 1, 1, + desc_wing.desc); } this->wing_mu.clear(); @@ -1053,6 +1063,8 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, 
n_nonsingular - 1, mb, mb, 0, 0); this->body_inv = init_local_mat>(desc_body, MAJOR::COL); /* for (int ilambda = 0; ilambda < n_nonsingular - 1; ilambda++) @@ -1071,6 +1083,7 @@ Array_Desc diele_func::get_body_inv(matrix_m> &chi0_block, Profiler::stop("get_inverse_body_of_chi0"); return desc_body; }; + #ifdef ENABLE_NVHPC Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, ComplexMatrixDevice& d_chi0_block, const Array_Desc& desc_nabf_nabf_opt) { @@ -1078,6 +1091,8 @@ Array_Desc diele_func::get_body_inv_nvhpc(const GpuDeviceStream& gpu_dev_stream, gpu_dev_stream.calSync(); Array_Desc desc_body(blacs_ctxt_global_h); desc_body.init_square_blk(n_nonsingular - 1, n_nonsingular - 1, 0, 0); + int mb = std::min(128,desc_body.mb()); + desc_body.init(n_nonsingular - 1, n_nonsingular - 1, mb, mb, 0, 0); this->d_body_inv.set_data(desc_body.m_loc(),desc_body.n_loc(),gpu_dev_stream.stream); Array_Desc_Device desc_body_dev(desc_body); CudaConnector::pgemr2d_nvhpc( @@ -1117,15 +1132,17 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); - + // opt descriptor for wing + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + Array_Desc desc_lam_3(blacs_ctxt_global_h); - desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); Array_Desc desc_3_lam(blacs_ctxt_global_h); - desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); Array_Desc desc_3_3(blacs_ctxt_global_h); - desc_3_3.init_square_blk(3, 3, 0, 0); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); @@ -1138,8 +1155,8 @@ void 
diele_func::construct_L(const int ifreq, Array_Desc &desc_body) desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, Lind_loc.ptr(), 1, 1, desc_3_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, - wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, - desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); + wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, + 1, desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); for (int i = 0; i != 3; i++) { @@ -1184,126 +1201,70 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const this->wb.resize(3, n_nonsingular - 1, MAJOR::COL); Array_Desc desc_wing(blacs_ctxt_global_h); desc_wing.init_square_blk(n_nonsingular - 1, 3, 0, 0); - - Array_Desc desc_wing_opt(blacs_ctxt_global_h); - desc_wing_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); - - + // opt descriptor for wing + desc_wing.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); + Array_Desc desc_lam_3(blacs_ctxt_global_h); - desc_lam_3.init_square_blk(n_nonsingular - 1, 3, 0, 0); - - Array_Desc desc_lam_3_opt(blacs_ctxt_global_h); - desc_lam_3_opt.init(n_nonsingular - 1, 3, desc_body.mb(), desc_lam_3.nb(), 0, 0); + desc_lam_3.init(n_nonsingular - 1, 3, desc_body.mb(), desc_wing.nb(), 0, 0); Array_Desc desc_3_lam(blacs_ctxt_global_h); - desc_3_lam.init_square_blk(3, n_nonsingular - 1, 0, 0); - - Array_Desc desc_3_lam_opt(blacs_ctxt_global_h); - desc_3_lam_opt.init(3, n_nonsingular - 1, desc_3_lam.mb(), desc_body.nb(), 0, 0); + desc_3_lam.init(3, n_nonsingular - 1, desc_wing.nb(), desc_body.mb(), 0, 0); Array_Desc desc_3_3(blacs_ctxt_global_h); - desc_3_3.init_square_blk(3, 3, 0, 0); + desc_3_3.init(3, 3, desc_wing.nb(), desc_wing.nb(), 0, 0); auto lam_3 = init_local_mat>(desc_lam_3, MAJOR::COL); auto _3_lam = init_local_mat>(desc_3_lam, MAJOR::COL); auto Lind_loc = init_local_mat>(desc_3_3, MAJOR::COL); - // 
printf("rank:%d,ma:%d,na:%d,mb:%d,nb:%d,mc:%d,nc:%d\n",gpu_dev_stream.rank,desc_body.m(),desc_body.n(),desc_wing_opt.m(),desc_wing_opt.n(),desc_lam_3_opt.m(),desc_lam_3_opt.n()); - // printf("rank:%d,m_loc_a:%d,n_loc_a:%d,m_loc_b:%d,n_loc_b:%d,m_loc_c:%d,n_loc_c:%d\n",gpu_dev_stream.rank,desc_body.m_loc(),desc_body.n_loc(),desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc()); - // printf("rank:%d,mba:%d,nba:%d,mbb:%d,nbb:%d,mbc:%d,nbc:%d\n",gpu_dev_stream.rank,desc_body.mb(),desc_body.nb(),desc_wing_opt.mb(),desc_wing_opt.nb(),desc_lam_3_opt.mb(),desc_lam_3_opt.nb()); // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); - #ifdef ENABLE_NVHPC - auto wing_ifreq_opt = init_local_mat>(desc_wing_opt, MAJOR::COL); - auto lam_3_opt = init_local_mat>(desc_lam_3_opt, MAJOR::COL); - auto _3_lam_opt = init_local_mat>(desc_3_lam_opt, MAJOR::COL); - ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc; - ComplexMatrixDevice d_lam_3_opt,d_3_lam_opt; - ComplexMatrixDevice d_wing_ifreq,d_wing_ifreq_opt; + + ComplexMatrixDevice d_lam_3,d_3_lam,d_Lind_loc,d_wing_ifreq; + d_wing_ifreq.set_data(wing.at(ifreq).nr(),wing.at(ifreq).nc(),wing.at(ifreq).ptr(),gpu_dev_stream.stream); - d_wing_ifreq_opt.set_data(desc_wing_opt.m_loc(),desc_wing_opt.n_loc(),gpu_dev_stream.stream); - // d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); - d_lam_3_opt.set_data(desc_lam_3_opt.m_loc(),desc_lam_3_opt.n_loc(),gpu_dev_stream.stream); - // d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); - d_3_lam_opt.set_data(desc_3_lam_opt.m_loc(),desc_3_lam_opt.n_loc(),gpu_dev_stream.stream); + + d_lam_3.set_data(desc_lam_3.m_loc(),desc_lam_3.n_loc(),gpu_dev_stream.stream); + d_3_lam.set_data(desc_3_lam.m_loc(),desc_3_lam.n_loc(),gpu_dev_stream.stream); d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); - ScalapackConnector::pgemr2d_f( - n_nonsingular-1, 3, - 
wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, - wing_ifreq_opt.ptr(), 1, 1, desc_wing_opt.desc, - blacs_ctxt_global_h.ictxt - ); - // CudaConnector::pgemr2d_nvhpc( - // gpu_dev_stream, n_nonsingular-1, 3, - // d_wing_ifreq.ptr(), 1, 1, desc_wing, - // d_wing_ifreq_opt.ptr(), 1, 1, desc_wing, - // CUDA_C_64F - // ); - CUDA_CHECK(cudaMemcpyAsync(d_wing_ifreq_opt.ptr(),wing_ifreq_opt.ptr(),desc_wing_opt.m_loc()*desc_wing_opt.n_loc()*sizeof(std::complex),cudaMemcpyHostToDevice,gpu_dev_stream.stream)); - std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + std::complex calpha(1.0,0.0),cbeta(0.0,0.0); + printf("successful before gemm1\n"); CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, &calpha, d_body_inv, 1, 1, desc_body, - d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_wing_ifreq, 1, 1, desc_wing, &cbeta, - d_lam_3_opt, 1, 1, desc_lam_3_opt, + d_lam_3, 1, 1, desc_lam_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - CUDA_CHECK(cudaMemcpyAsync(lam_3_opt.ptr(),d_lam_3_opt.ptr(),desc_lam_3_opt.m_loc()*desc_lam_3_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); - gpu_dev_stream.cudaSync(); - ScalapackConnector::pgemr2d_f( - n_nonsingular-1, 3, - lam_3_opt.ptr(), 1, 1, desc_lam_3_opt.desc, - lam_3.ptr(), 1, 1, desc_lam_3.desc, - blacs_ctxt_global_h.ictxt - ); - gpu_dev_stream.cudaSync(); - #else - ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, - body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, - desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); - #endif - #ifdef ENABLE_NVHPC + printf("successful after gemm1\n"); + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); + // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); + // 
printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, 3, n_nonsingular - 1, &calpha, - d_wing_ifreq_opt, 1, 1, desc_wing_opt, - d_lam_3_opt, 1, 1, desc_lam_3_opt, + d_wing_ifreq, 1, 1, desc_wing, + d_lam_3, 1, 1, desc_lam_3, &cbeta, d_Lind_loc, 1, 1, desc_3_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(),d_Lind_loc.ptr(),desc_3_3.m_loc()*desc_3_3.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); - gpu_dev_stream.cudaSync(); - #else - ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, - desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, - Lind_loc.ptr(), 1, 1, desc_3_3.desc); - #endif - #ifndef ENABLE_NVHPC + printf("successful after gemm2\n"); CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, &calpha, - d_wing_ifreq_opt, 1, 1, desc_wing_opt, + d_wing_ifreq, 1, 1, desc_wing, d_body_inv, 1, 1, desc_body, &cbeta, - d_3_lam_opt, 1, 1, desc_3_lam_opt, + d_3_lam, 1, 1, desc_3_lam, CUBLAS_COMPUTE_64F_PEDANTIC ); - CUDA_CHECK(cudaMemcpyAsync(_3_lam_opt.ptr(),d_3_lam_opt.ptr(),desc_3_lam_opt.m_loc()*desc_3_lam_opt.n_loc()*sizeof(std::complex),cudaMemcpyDeviceToHost,gpu_dev_stream.stream)); + printf("successful after gemm3\n"); + CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); + CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); gpu_dev_stream.cudaSync(); 
- ScalapackConnector::pgemr2d_f( - 3, n_nonsingular-1, - _3_lam_opt.ptr(), 1, 1, desc_3_lam_opt.desc, - _3_lam.ptr(), 1, 1, desc_3_lam.desc, - blacs_ctxt_global_h.ictxt - ); - #else - ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, - wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, 1, - desc_body.desc, 0.0, _3_lam.ptr(), 1, 1, desc_3_lam.desc); - #endif - for (int i = 0; i != 3; i++) { auto loc_i = desc_3_3.indx_g2l_r(i); @@ -1499,12 +1460,9 @@ void diele_func::cal_eps(const int ifreq, Array_Desc &desc_nabf_nabf_opt, Array_ chi0(ilo, jlo) = result; } } - ScalapackConnector::pgeadd_f( - 'N', n_nonsingular - 1, n_nonsingular - 1, - 1.0, - body_inv.ptr(), 1, 1, desc_body.desc, - 1.0, - chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); + + ScalapackConnector::pgeadd_f('N', n_nonsingular - 1, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, + 1, desc_body.desc, 1.0, chi0.ptr(), 2, 2, desc_nabf_nabf_opt.desc); Profiler::stop("cal_inverse_dielectric_matrix_ij"); if (mpi_comm_global_h.is_root()) std::cout << "* Success: calculate average inverse dielectric matrix no." 
<< ifreq + 1 From 3b16358ca3fd394db2d940684542bc310ecacafa Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:25:52 +0800 Subject: [PATCH 17/18] Update dielecmodel.cpp --- src/dielecmodel.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index ee046300..74de2b27 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -1227,7 +1227,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); std::complex calpha(1.0,0.0),cbeta(0.0,0.0); - printf("successful before gemm1\n"); + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, &calpha, @@ -1237,7 +1237,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_lam_3, 1, 1, desc_lam_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - printf("successful after gemm1\n"); + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); // printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); @@ -1250,7 +1250,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc, 1, 1, desc_3_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - printf("successful after gemm2\n"); + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, &calpha, @@ -1260,7 +1260,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_3_lam, 1, 1, desc_3_lam, CUBLAS_COMPUTE_64F_PEDANTIC ); - printf("successful after gemm3\n"); + 
CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); From 0d11aebc60fa3f24b117ad4801276bcf3ded3e9c Mon Sep 17 00:00:00 2001 From: userchba <88939318+userchba@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:45:09 +0800 Subject: [PATCH 18/18] Add files via upload --- src/dielecmodel.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/dielecmodel.cpp b/src/dielecmodel.cpp index cdcff61a..8338c640 100644 --- a/src/dielecmodel.cpp +++ b/src/dielecmodel.cpp @@ -514,7 +514,7 @@ void diele_func::wing_mu_to_lambda(matrix_m> &sqrtveig_blac for (int iomega = 0; iomega != this->omega.size(); iomega++) { auto &wing_tmp = this->wing.at(iomega); - wing_tmp = init_local_mat>(desc_wing_opt, MAJOR::COL); + wing_tmp = init_local_mat>(desc_wing, MAJOR::COL); // TODO: reconstruct wing_mu auto wing_mu_tmp = init_local_mat>(desc_wing_mu, MAJOR::COL); for (int alpha = 0; alpha != 3; alpha++) @@ -1150,9 +1150,9 @@ void diele_func::construct_L(const int ifreq, Array_Desc &desc_body) // tmp = head.at(ifreq) - transpose(wing.at(ifreq), true) * body_inv * wing.at(ifreq); ScalapackConnector::pgemm_f('N', 'N', n_nonsingular - 1, 3, n_nonsingular - 1, 1.0, body_inv.ptr(), 1, 1, desc_body.desc, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); + desc_wing.desc, 0.0, lam_3.ptr(), 1, 1, desc_lam_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, 3, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, - desc_wing_opt.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, + 
desc_wing.desc, lam_3.ptr(), 1, 1, desc_lam_3.desc, 0.0, Lind_loc.ptr(), 1, 1, desc_3_3.desc); ScalapackConnector::pgemm_f('C', 'N', 3, n_nonsingular - 1, n_nonsingular - 1, 1.0, wing.at(ifreq).ptr(), 1, 1, desc_wing.desc, body_inv.ptr(), 1, @@ -1227,7 +1227,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc.set_data(desc_3_3.m_loc(),desc_3_3.n_loc(),gpu_dev_stream.stream); std::complex calpha(1.0,0.0),cbeta(0.0,0.0); - + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_N, CUBLAS_OP_N, n_nonsingular - 1, 3, n_nonsingular - 1, &calpha, @@ -1237,7 +1237,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_lam_3, 1, 1, desc_lam_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - + // printf("desc_wing:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_wing.m_loc(),desc_wing.n_loc(),desc_wing.lld(),desc_wing.mb(),desc_wing.nb()); // printf("desc_lam_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_lam_3.m_loc(),desc_lam_3.n_loc(),desc_lam_3.lld(),desc_lam_3.mb(),desc_lam_3.nb()); // printf("desc_3_3:m_loc:%d,n_loc:%d,lld:%d,mb:%d,nb:%d\n",desc_3_3.m_loc(),desc_3_3.n_loc(),desc_3_3.lld(),desc_3_3.mb(),desc_3_3.nb()); @@ -1250,7 +1250,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_Lind_loc, 1, 1, desc_3_3, CUBLAS_COMPUTE_64F_PEDANTIC ); - + CudaConnector::pgemm_nvhpc( gpu_dev_stream, CUBLAS_OP_C, CUBLAS_OP_N, 3, n_nonsingular - 1, n_nonsingular - 1, &calpha, @@ -1260,7 +1260,7 @@ void diele_func::construct_L_nvhpc(const GpuDeviceStream& gpu_dev_stream, const d_3_lam, 1, 1, desc_3_lam, CUBLAS_COMPUTE_64F_PEDANTIC ); - + CUDA_CHECK(cudaMemcpyAsync(lam_3.ptr(), d_lam_3.ptr(), desc_lam_3.m_loc() * desc_lam_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); CUDA_CHECK(cudaMemcpyAsync(_3_lam.ptr(), d_3_lam.ptr(), desc_3_lam.m_loc() * desc_3_lam.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream)); 
CUDA_CHECK(cudaMemcpyAsync(Lind_loc.ptr(), d_Lind_loc.ptr(), desc_3_3.m_loc() * desc_3_3.n_loc() * sizeof(std::complex), cudaMemcpyDeviceToHost, gpu_dev_stream.stream));