diff --git a/.cmake/mito-config.cmake.in b/.cmake/mito-config.cmake.in
index 124606f58..d792ed33f 100644
--- a/.cmake/mito-config.cmake.in
+++ b/.cmake/mito-config.cmake.in
@@ -44,4 +44,10 @@ endif(@WITH_VTK@)
 if(@WITH_PETSC@)
     # add compiler definitions
     add_definitions(-DWITH_PETSC)
-endif(@WITH_PETSC@)
\ No newline at end of file
+endif(@WITH_PETSC@)
+
+# cuda dependency
+if(@WITH_CUDA@)
+    # add compiler definitions
+    add_definitions(-DWITH_CUDA)
+endif(@WITH_CUDA@)
\ No newline at end of file
diff --git a/.cmake/mito_cuda.cmake b/.cmake/mito_cuda.cmake
new file mode 100644
index 000000000..d1f0cc7b1
--- /dev/null
+++ b/.cmake/mito_cuda.cmake
@@ -0,0 +1,54 @@
+# -*- cmake -*-
+#
+# Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+#
+
+
+# CUDA support
+option(WITH_CUDA "Enable support for CUDA" OFF)
+
+# if CUDA is requested
+if(WITH_CUDA)
+    # find CUDA package
+    find_package(CUDAToolkit REQUIRED)
+    # report
+    message(STATUS "Enable CUDA support")
+    # add compiler definitions
+    add_definitions(-DWITH_CUDA)
+    message(STATUS "Add definition WITH_CUDA")
+    # enable CUDA language for targets
+    enable_language(CUDA)
+    # include CUDA headers
+    target_include_directories(mito PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+    # link against CUDA libraries
+    target_link_libraries(mito PUBLIC CUDA::cudart CUDA::cusolver)
+    # set CUDA host compiler to match the C++ compiler (to tell NVCC the compiler to use for the host code)
+    if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+        set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE FILEPATH "Host compiler for NVCC")
+    endif()
+    message(STATUS "Using CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
+    # disable offline compilation support warnings
+    target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>)
+    # find Eigen for CUDA sparse solver
+    find_package(Eigen3 3.3 QUIET NO_MODULE)
+    if(Eigen3_FOUND)
+        # add Eigen compiler definition
+        message(STATUS "Found Eigen3: ${EIGEN3_INCLUDE_DIR}")
+        add_definitions(-DWITH_EIGEN)
+        # add include directories for Eigen
+        target_include_directories(mito PUBLIC ${EIGEN3_INCLUDE_DIR})
+        # disable warnings due to incompatible return types in Eigen
+        target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
+        # find cudss library
+        find_package(cudss REQUIRED)
+        # include cudss headers
+        target_include_directories(mito PUBLIC ${cudss_INCLUDE_DIR})
+        # link against cudss libraries
+        target_link_libraries(mito PUBLIC cudss)
+    else()
+        message(WARNING "Eigen3 not found. 
CUDA sparse solver parts will be disabled.") + endif() +endif() + + +# end of file diff --git a/.cmake/mito_sources.cmake b/.cmake/mito_sources.cmake index 3095b5fea..86c600cd2 100644 --- a/.cmake/mito_sources.cmake +++ b/.cmake/mito_sources.cmake @@ -14,5 +14,14 @@ set(MITO_SOURCES ${MITO_SOURCES} ) endif() +# the mito cuda solvers backend +if (WITH_CUDA) +set(MITO_SOURCES ${MITO_SOURCES} + lib/mito/solvers/backend/cuda/CUDASolver.cu + lib/mito/solvers/backend/cuda/CUDADenseSolver.cu + lib/mito/solvers/backend/cuda/CUDASparseSolver.cu +) +endif() + # end of file diff --git a/.cmake/mito_tests_mito_lib.cmake b/.cmake/mito_tests_mito_lib.cmake index b3a8b9a84..0e63104eb 100644 --- a/.cmake/mito_tests_mito_lib.cmake +++ b/.cmake/mito_tests_mito_lib.cmake @@ -69,6 +69,11 @@ if(WITH_PETSC) mito_test_driver(tests/mito.lib/solvers/petsc_solve_linear_system.cc) endif() +if(WITH_CUDA) + mito_test_driver(tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc) + mito_test_driver(tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc) +endif() + # tensor mito_test_driver(tests/mito.lib/tensor/one_forms.cc) mito_test_driver(tests/mito.lib/tensor/contractions.cc) diff --git a/CMakeLists.txt b/CMakeLists.txt index 458b981ca..665525e3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,9 @@ include(mito_metis) # petsc support include(mito_petsc) +# cuda support +include(mito_cuda) + # configure mito as a cmake package (so that mito can be found by cmake with find_package(mito)) mito_shareCmakePackage() diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu new file mode 100644 index 000000000..2c0e79401 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu @@ -0,0 +1,243 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include "public.h" + + +// constructor +template +mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver(SolverType solver_type) : + CUDASolver(solver_type), + _h_matrix(), + _d_matrix(), + _cusolver_handle() +{ + // initialize cuSOLVER + _initialize_cusolver(); +} + +// destructor +template +mito::solvers::cuda::CUDADenseSolver::~CUDADenseSolver() +{ + // finalize cuSOLVER + _finalize_cusolver(); +} + +// add/insert {value} to matrix entry at ({row}, {col}) of the host copy +template +auto +mito::solvers::cuda::CUDADenseSolver::set_matrix_value( + size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void +{ + // check if the system assembly is finalized and throw an error if it is + if (this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is already finalized. Cannot add/insert values to the matrix."); + } + + // check if the row and column indices are within bounds + this->_check_index_validity(row); + this->_check_index_validity(col); + + // add/insert the value to the matrix entry in the host matrix + // NOTE: We store the matrix in column-major order since the cuSOLVER library expects the matrix + // to be in column-major order. + if (insert_mode == InsertMode::ADD_VALUE) + _h_matrix[col * this->_size + row] += value; + else if (insert_mode == InsertMode::INSERT_VALUE) + _h_matrix[col * this->_size + row] = value; + else + throw std::invalid_argument("Invalid insert mode. 
Use ADD_VALUE or INSERT_VALUE."); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::reset_system() -> void +{ + // check if the solver is initialized + if (!this->_is_solver_initialized) { + throw std::logic_error("Solver is not yet initialized. Call initialize() first."); + } + + // fill the host matrix, rhs and solution with zeros + _h_matrix.zero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // reset the assembly finalized flag + this->_is_assembly_finalized = false; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::solve() -> void +{ + // check if the assembly is finalized + if (!this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is not yet finalized. Call finalize_assembly() first."); + } + + // copy the host matrix and rhs data to device global memory + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.data(), _h_matrix.data(), this->_size * this->_size * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // allocate device memory for temporary variables in the factorization + int * d_pivot = nullptr; + int * d_info = nullptr; + real_type * d_workspace = nullptr; + int workspace_size = 0; + + // check the solver type is either LU or Cholesky + if (this->_solver_type != SolverType::LU && this->_solver_type != SolverType::CHOLESKY) { + throw std::invalid_argument( + "Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense " + "solver."); + } + + // get the workspace size for the factorization + if (this->_solver_type == SolverType::LU) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::getrf_buffer_size( + _cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size, + &workspace_size)); + } else if (this->_solver_type == SolverType::CHOLESKY) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::potrf_buffer_size( + _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(), + this->_size, &workspace_size)); + } + + CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, this->_size * sizeof(int))); + CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int))); + CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(real_type))); + + // perform the factorization + if (this->_solver_type == SolverType::LU) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::getrf( + _cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size, + d_workspace, d_pivot, d_info)); + } else if (this->_solver_type == SolverType::CHOLESKY) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::potrf( + _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(), + this->_size, d_workspace, workspace_size, d_info)); + } + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // solve the linear system + if (this->_solver_type == SolverType::LU) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::getrs( + _cusolver_handle, CUBLAS_OP_N, this->_size, 1, _d_matrix.data(), this->_size, + d_pivot, this->_d_rhs.data(), this->_size, d_info)); + } else if (this->_solver_type == SolverType::CHOLESKY) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::potrs( + _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, 1, _d_matrix.data(), + this->_size, this->_d_rhs.data(), this->_size, d_info)); + } + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // copy the solution from device global memory to host memory + // NOTE: _d_rhs contains 
the solution after the call to getrs/potrs as its contents are + // overwritten by the solution vector + CHECK_CUDA_ERROR(cudaMemcpy( + this->_h_solution.data(), this->_d_rhs.data(), this->_size * sizeof(real_type), + cudaMemcpyDeviceToHost)); + + // free the temporary device memory + CHECK_CUDA_ERROR(cudaFree(d_pivot)); + CHECK_CUDA_ERROR(cudaFree(d_info)); + CHECK_CUDA_ERROR(cudaFree(d_workspace)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_initialize_cusolver() -> void +{ + // create the cuSOLVER handle + CHECK_CUSOLVER_ERROR(cusolverDnCreate(&_cusolver_handle)); + + // create a cuda stream + CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0)); + + // set the stream for the cuSOLVER handle + CHECK_CUSOLVER_ERROR(cusolverDnSetStream(_cusolver_handle, this->_cuda_stream)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_finalize_cusolver() -> void +{ + // destroy the cuSOLVER handle + CHECK_CUSOLVER_ERROR(cusolverDnDestroy(_cusolver_handle)); + + // destroy the cuda stream + CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream)); + + // reset the handle and stream pointers + _cusolver_handle = nullptr; + this->_cuda_stream = nullptr; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void +{ + // allocate host memory for matrix, rhs and solution + _h_matrix.allocate(size * size); + this->_h_rhs.allocate(size); + this->_h_solution.allocate(size); + + // zero out the arrays + _h_matrix.zero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void +{ + // allocate global device memory for matrix and rhs + _d_matrix.allocate(size * size); + this->_d_rhs.allocate(size); + + // all done + return; +} + +// explicit instantiation of the {CUDADenseSolver} class for doubles +template class mito::solvers::cuda::CUDADenseSolver; + +// explicit instantiation of the {CUDADenseSolver} class for floats +template class mito::solvers::cuda::CUDADenseSolver; diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h new file mode 100644 index 000000000..0c26e113f --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h @@ -0,0 +1,90 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +// a struct to hold the cusolver function pointers for different data types +template +struct cusolver_traits; + +template <> +struct cusolver_traits { + // Cholesky factorization and solve routines for double precision + static constexpr auto potrf_buffer_size = cusolverDnDpotrf_bufferSize; + static constexpr auto potrf = cusolverDnDpotrf; + static constexpr auto potrs = cusolverDnDpotrs; + // LU factorization and solve routines for double precision + static constexpr auto getrf_buffer_size = cusolverDnDgetrf_bufferSize; + static constexpr auto getrf = cusolverDnDgetrf; + static constexpr auto getrs = cusolverDnDgetrs; +}; + +template <> +struct cusolver_traits { + // Cholesky factorization and solve routines for single precision + static constexpr auto potrf_buffer_size = cusolverDnSpotrf_bufferSize; + static constexpr auto potrf = cusolverDnSpotrf; + static constexpr auto potrs = cusolverDnSpotrs; + // LU factorization and solve routines for single precision + 
static constexpr auto getrf_buffer_size = cusolverDnSgetrf_bufferSize; + static constexpr auto getrf = cusolverDnSgetrf; + static constexpr auto getrs = cusolverDnSgetrs; +}; + + +namespace mito::solvers::cuda { + + template + class CUDADenseSolver : public CUDASolver { + private: + // real type definition from CUDASolver + using real_type = typename CUDASolver::real_type; + + public: + // constructor + CUDADenseSolver(SolverType solver_type = SolverType::LU); + + // destructor + ~CUDADenseSolver(); + + public: + // set (add or insert depending on the mode) the value of a matrix entry in the host copy + auto set_matrix_value( + size_t row, size_t col, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override; + + // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution) + auto reset_system() -> void override; + + // solve the linear system + auto solve() -> void override; + + private: + // initialize the cuSOLVER utilities + auto _initialize_cusolver() -> void override; + + // destroy the cuSOLVER utilities + auto _finalize_cusolver() -> void override; + + // allocate the host memory for the matrix, right-hand side, and solution + auto _allocate_host_memory(size_t size) -> void override; + + // allocate the device memory for the matrix and right-hand side + auto _allocate_device_memory(size_t size) -> void override; + + private: + // host copy of the matrix + HostArray _h_matrix; + // device copy of the matrix + DeviceArray _d_matrix; + // cuSOLVER handle + cusolverDnHandle_t _cusolver_handle; + }; +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h new file mode 100644 index 000000000..0fb22c848 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h @@ -0,0 +1,70 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +// code guard +#pragma once + + +// CUDA error checking macro +#define CHECK_CUDA_ERROR(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, " Error: %s (%d)\n", cudaGetErrorString(err), err); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// function to convert cuSOLVER error codes to strings +inline const char * +cusolverGetErrorString(cusolverStatus_t status) +{ + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "Unknown cuSOLVER error"; + } +} + +// cuSOLVER error checking macro +#define CHECK_CUSOLVER_ERROR(call) \ + do { \ + cusolverStatus_t status = (call); \ + if (status != CUSOLVER_STATUS_SUCCESS) { \ + fprintf( \ + stderr, "cuSOLVER error: %s at %s:%d\n", cusolverGetErrorString(status), __FILE__, \ + __LINE__); \ + 
exit(EXIT_FAILURE); \ + } \ + } while (0) + +// cuDSS error checking macro +#define CHECK_CUDSS_ERROR(call) \ + do { \ + cudssStatus_t status = (call); \ + if (status != CUDSS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuDSS error: %d at %s:%d\n", status, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.cu b/lib/mito/solvers/backend/cuda/CUDASolver.cu new file mode 100644 index 000000000..8ee9e4641 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASolver.cu @@ -0,0 +1,134 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include "public.h" + + +// constructor +template +mito::solvers::cuda::CUDASolver::CUDASolver(SolverType solver_type) : + _h_rhs(), + _h_solution(), + _d_rhs(), + _solver_type(solver_type), + _size(0), + _is_solver_initialized(false), + _is_assembly_finalized(false), + _cuda_stream(nullptr) +{} + +// destructor +template +mito::solvers::cuda::CUDASolver::~CUDASolver() +{} + +template +auto +mito::solvers::cuda::CUDASolver::initialize(size_t size) -> void +{ + // check if the solver is already initialized + if (_is_solver_initialized) { + throw std::logic_error( + "Solver is already initialized. You are not allowed to re-initialize it. If you want " + "to re-initialize, you need to create a new solver instance."); + } + + // check if the size is valid + if (size <= 0) { + throw std::invalid_argument("Size of the linear system must be greater than zero."); + } + + // save the size of the linear system + _size = size; + + // allocate host memory + _allocate_host_memory(size); + + // allocate device memory + _allocate_device_memory(size); + + // turn on the solver initialized flag + _is_solver_initialized = true; + + // all done + return; +} + +// add/insert {value} to rhs entry at {row} of the host copy +template +auto +mito::solvers::cuda::CUDASolver::set_rhs_value( + size_t row, const real_type value, const InsertMode insert_mode) -> void +{ + // check if the system assembly is finalized and throw an error if it is + if (_is_assembly_finalized) { + throw std::logic_error( + "System assembly is already finalized. Cannot add/insert values to the rhs."); + } + + // check if the row index is within bounds + this->_check_index_validity(row); + + // add/insert the value to the rhs entry in the host rhs + if (insert_mode == InsertMode::ADD_VALUE) + _h_rhs[row] += value; + else if (insert_mode == InsertMode::INSERT_VALUE) + _h_rhs[row] = value; + else + throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE."); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASolver::finalize_assembly() -> void +{ + // check if the solver is initialized + if (!_is_solver_initialized) { + throw std::logic_error( + "Solver is not yet initialized. Call initialize() first, assemble the " + "system, and then finalize the assembly."); + } + + // issue a warning that all entries should be set before finalizing the assembly + std::cerr + << "Warning: Finalizing assembly. Make sure all system entries are set before this step." + << std::endl; + + // set the assembly finalized flag to true + _is_assembly_finalized = true; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASolver::_check_index_validity(size_t index) const -> void +{ + // check if the solver is initialized + if (!_is_solver_initialized) { + throw std::logic_error("Solver is not yet initialized. 
Call initialize() first."); + } + + // check if the index is valid and return false if it is not + if (index >= _size) { + throw std::out_of_range( + "Index " + std::to_string(index) + " is out of range. It must be between 0 and " + + std::to_string(_size - 1) + "."); + } + + // all done + return; +} + +// explicit instantiation of the {CUDASolver} class for doubles +template class mito::solvers::cuda::CUDASolver; + +// explicit instantiation of the {CUDASolver} class for floats +template class mito::solvers::cuda::CUDASolver; diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.h b/lib/mito/solvers/backend/cuda/CUDASolver.h new file mode 100644 index 000000000..f8c5e3e9b --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASolver.h @@ -0,0 +1,96 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + template + class CUDASolver { + public: + // type alias for real numbers + using real_type = realT; + + public: + // constructor + CUDASolver(SolverType solver_type); + + // destructor + ~CUDASolver(); + + public: + // initialize the CUDA solver + auto initialize(size_t size) -> void; + + // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution) + virtual auto reset_system() -> void = 0; + + // set (add or insert depending on the mode) the value of a matrix entry in the host copy + virtual auto set_matrix_value( + size_t row, size_t col, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void = 0; + + // set (add or insert depending on the mode) the value of a right-hand side entry in the + // host copy + virtual auto set_rhs_value( + size_t row, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void; + + // finalize the linear system assembly + auto finalize_assembly() -> void; + + // solve the linear system + virtual auto solve() -> void = 0; + + // get the solution vector + template + auto get_solution(solutionT & solution) const -> void; + + protected: + // initialize the cuSOLVER utilities + virtual auto _initialize_cusolver() -> void = 0; + + // destroy the cuSOLVER utilities + virtual auto _finalize_cusolver() -> void = 0; + + // allocate the host memory for the matrix, right-hand side, and solution + virtual auto _allocate_host_memory(size_t size) -> void = 0; + + // allocate the device memory for the matrix and right-hand side + virtual auto _allocate_device_memory(size_t size) -> void = 0; + + // check the validity of the index in the matrix and right-hand side + auto _check_index_validity(size_t index) const -> void; + + protected: + // host copy of the right-hand side + HostArray _h_rhs; + // host copy of the solution + HostArray _h_solution; + // device copy of the right-hand side + DeviceArray _d_rhs; + // solver type + SolverType _solver_type; + // size of the linear system + size_t _size; + // flag to indicate if the solver has been initialized + bool _is_solver_initialized; + // flag to indicate if the system assembly has been finalized + bool _is_assembly_finalized; + // cuda stream + cudaStream_t _cuda_stream; + }; +} + + +// get the template definitions +#define mito_solvers_backend_cuda_CUDASolver_icc +#include "CUDASolver.icc" +#undef mito_solvers_backend_cuda_CUDASolver_icc + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.icc b/lib/mito/solvers/backend/cuda/CUDASolver.icc new file mode 100644 index 000000000..5ee5e26f3 --- /dev/null 
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.icc @@ -0,0 +1,32 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#if !defined(mito_solvers_backend_cuda_CUDASolver_icc) +#error This header file contains implementation details of class mito::solvers::cuda::CUDASolver +#else + +template +template +auto +mito::solvers::cuda::CUDASolver::get_solution(solutionT & solution) const -> void +{ + // check if {_size} matches the size of {solution} + assert(_size == std::size(solution)); + + // reset the entries in {solution} + std::fill(std::begin(solution), std::end(solution), 0.0); + + // copy {_h_solution} into {solution} + std::copy(_h_solution.data(), _h_solution.data() + _size, std::begin(solution)); + + // all done + return; +} + +#endif // mito_solvers_backend_cuda_CUDASolver_icc + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu new file mode 100644 index 000000000..8186672e1 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu @@ -0,0 +1,298 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include "public.h" + + +// constructor +template +mito::solvers::cuda::CUDASparseSolver::CUDASparseSolver(SolverType solver_type) : + CUDASolver(solver_type), + _h_matrix(), + _d_matrix(), + _d_solution(), + _cudss_handle() +{ + // initialize cuSOLVER + _initialize_cusolver(); +} + +// destructor +template +mito::solvers::cuda::CUDASparseSolver::~CUDASparseSolver() +{ + // finalize cuSOLVER + _finalize_cusolver(); +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::initialize(size_t size, size_t nnz) -> void +{ + // call the base class initialize function + CUDASolver::initialize(size); + + // reserve space for non-zero entries per row in the host sparse matrix for efficiency + // NOTE: This is just an estimate. The actual number of non-zero entries may be less than + // nnz. But it is better to set it to a reasonable value to avoid multiple reallocations. + // In general, over-allocating is better than under-allocating. + (nnz == 0) ? _h_matrix.reserve(Eigen::VectorXi::Constant(size, size)) : + _h_matrix.reserve(Eigen::VectorXi::Constant(size, nnz)); + + // all done + return; +} + +// add/insert {value} to matrix entry at ({row}, {col}) of the host copy +template +auto +mito::solvers::cuda::CUDASparseSolver::set_matrix_value( + size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void +{ + // check if the system assembly is finalized and throw an error if it is + if (this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is already finalized. Cannot add/insert values to the matrix."); + } + + // check if the row and column indices are within bounds + this->_check_index_validity(row); + this->_check_index_validity(col); + + // add/insert the value to the matrix entry in the host matrix + // NOTE: We store the host matrix in CSR format using Eigen's SparseMatrix class in row-major + // order. + if (insert_mode == InsertMode::ADD_VALUE) + _h_matrix.coeffRef(row, col) += value; + else if (insert_mode == InsertMode::INSERT_VALUE) + _h_matrix.coeffRef(row, col) = value; + else + throw std::invalid_argument("Invalid insert mode. 
Use ADD_VALUE or INSERT_VALUE."); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::reset_system() -> void +{ + // check if the solver is initialized + if (!this->_is_solver_initialized) { + throw std::logic_error("Solver is not yet initialized. Call initialize() first."); + } + + // fill the host matrix, rhs and solution with zeros + _h_matrix.setZero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // reset the assembly finalized flag + this->_is_assembly_finalized = false; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::solve() -> void +{ + // check if the assembly is finalized + if (!this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is not yet finalized. Call finalize_assembly() first."); + } + + // finalize the host matrix (compress the storage) + // NOTE: This is not a mandatory step as we are only using the Eigen matrix for storage on the + // host but we are doing it anyway. + _h_matrix.makeCompressed(); + + // create and fill the auxiliary containers to hold the CSR representation of the host matrix + size_t nnz_matrix = _h_matrix.nonZeros(); + std::vector h_csr_values(nnz_matrix, 0.0); + std::vector h_csr_row_offsets(this->_size + 1, 0); + std::vector h_csr_col_indices(nnz_matrix, 0); + int idx = 0; + for (size_t k = 0; k < this->_size; ++k) { // for each row + h_csr_row_offsets[k] = idx; + for (typename Eigen::SparseMatrix::InnerIterator it( + _h_matrix, k); + it; ++it) { // for each non-zero entry in the row + h_csr_values[idx] = it.value(); + h_csr_col_indices[idx] = it.col(); + ++idx; + } + } + h_csr_row_offsets[this->_size] = nnz_matrix; + + // allocate device memory for the matrix + _d_matrix.allocate(this->_size, this->_size, nnz_matrix); + + // copy the host matrix and rhs data to device global memory + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.values(), h_csr_values.data(), h_csr_values.size() * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.row_offsets(), h_csr_row_offsets.data(), h_csr_row_offsets.size() * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.col_indices(), h_csr_col_indices.data(), h_csr_col_indices.size() * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // cuDSS data structures to solve the linear system + cudssConfig_t config; + cudssData_t data; + cudssMatrix_t d_A, d_b, d_x; + + // create the cuDSS config and data structures + CHECK_CUDSS_ERROR(cudssConfigCreate(&config)); + CHECK_CUDSS_ERROR(cudssDataCreate(_cudss_handle, &data)); + + // determine the data type + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_64F; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } + + // determine the matrix type based on the solver type + cudssMatrixType_t mtype; + if (this->_solver_type == SolverType::LU) { + mtype = CUDSS_MTYPE_GENERAL; + } else if (this->_solver_type == SolverType::CHOLESKY) { + mtype = CUDSS_MTYPE_SPD; + } else { + throw std::logic_error( + "Unsupported solver type. 
Only LU and CHOLESKY are supported in the CUDA sparse " + "solver."); + } + + // create (CSR) matrix and vector descriptors + CHECK_CUDSS_ERROR(cudssMatrixCreateCsr( + &d_A, this->_size, this->_size, nnz_matrix, _d_matrix.row_offsets(), NULL, + _d_matrix.col_indices(), _d_matrix.values(), CUDA_R_32I, dtype, mtype, CUDSS_MVIEW_FULL, + CUDSS_BASE_ZERO)); + CHECK_CUDSS_ERROR(cudssMatrixCreateDn( + &d_b, this->_size, 1, this->_size, this->_d_rhs.data(), dtype, CUDSS_LAYOUT_COL_MAJOR)); + CHECK_CUDSS_ERROR(cudssMatrixCreateDn( + &d_x, this->_size, 1, this->_size, _d_solution.data(), dtype, CUDSS_LAYOUT_COL_MAJOR)); + + /************* solve the linear system in three stages *************/ + + // Stage 1: reordering & symbolic factorization + CHECK_CUDSS_ERROR( + cudssExecute(_cudss_handle, CUDSS_PHASE_ANALYSIS, config, data, d_A, d_x, d_b)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // Stage 2: numerical factorization + CHECK_CUDSS_ERROR( + cudssExecute(_cudss_handle, CUDSS_PHASE_FACTORIZATION, config, data, d_A, d_x, d_b)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // Stage 3: solve the system + CHECK_CUDSS_ERROR(cudssExecute(_cudss_handle, CUDSS_PHASE_SOLVE, config, data, d_A, d_x, d_b)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // copy the solution back to host + CHECK_CUDA_ERROR(cudaMemcpy( + this->_h_solution.data(), _d_solution.data(), this->_size * sizeof(real_type), + cudaMemcpyDeviceToHost)); + + // destroy the cuDSS data structures + CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_A)); + CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_b)); + CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_x)); + CHECK_CUDSS_ERROR(cudssDataDestroy(_cudss_handle, data)); + CHECK_CUDSS_ERROR(cudssConfigDestroy(config)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_initialize_cusolver() -> void +{ + // create the cuDSS handle + CHECK_CUDSS_ERROR(cudssCreate(&_cudss_handle)); + + // create a cuda stream + CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0)); + + // set the stream for the cuDSS handle + CHECK_CUDSS_ERROR(cudssSetStream(_cudss_handle, this->_cuda_stream)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_finalize_cusolver() -> void +{ + // destroy the cuDSS handle + CHECK_CUDSS_ERROR(cudssDestroy(_cudss_handle)); + + // destroy the cuda stream + CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream)); + + // reset the handle and stream pointers + _cudss_handle = nullptr; + this->_cuda_stream = nullptr; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_allocate_host_memory(size_t size) -> void +{ + // allocate host memory for matrix, rhs and solution + _h_matrix.resize(size, size); + this->_h_rhs.allocate(size); + this->_h_solution.allocate(size); + + // zero out the arrays + _h_matrix.setZero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_allocate_device_memory(size_t size) -> void +{ + // NOTE: We are not allocating memory for the matrix here because we do not know the number of + // non-zero entries yet and allocating device memory is an expensive operation. So we will + // allocate the device memory for the matrix directly after the assembly is finalized. 
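+    // (accordingly, solve() above sizes and fills the device CSR buffers via _d_matrix.allocate(_size, _size, nnz) once the non-zero count is known)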
+ + // allocate device memory for rhs and solution + this->_d_rhs.allocate(size); + _d_solution.allocate(size); + + // zero out the solution array + _d_solution.zero(); + + // all done + return; +} + +// explicit instantiation of the {CUDASparseSolver} class for doubles +template class mito::solvers::cuda::CUDASparseSolver; + +// explicit instantiation of the {CUDASparseSolver} class for floats +template class mito::solvers::cuda::CUDASparseSolver; diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.h b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h new file mode 100644 index 000000000..c8967aee1 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h @@ -0,0 +1,65 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + template + class CUDASparseSolver : public CUDASolver { + private: + // real type definition from CUDASolver + using real_type = typename CUDASolver::real_type; + + public: + // constructor + CUDASparseSolver(SolverType solver_type = SolverType::LU); + + // destructor + ~CUDASparseSolver(); + + public: + // initialize the CUDA sparse solver + auto initialize(size_t size, size_t nnz = 0) -> void; + + // set (add or insert depending on the mode) the value of a matrix entry in the host copy + auto set_matrix_value( + size_t row, size_t col, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override; + + // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution) + auto reset_system() -> void override; + + // solve the linear system + auto solve() -> void override; + + private: + // initialize the cuSOLVER utilities + auto _initialize_cusolver() -> void override; + + // destroy the cuSOLVER utilities + auto _finalize_cusolver() -> void override; + + // allocate the host memory for the matrix, right-hand side, and solution + auto _allocate_host_memory(size_t size) -> void override; + + // allocate the device memory for the matrix and right-hand side + auto _allocate_device_memory(size_t size) -> void override; + + private: + // host copy of the matrix in CSR format + Eigen::SparseMatrix _h_matrix; + // device copy of the matrix in CSR format + DeviceSparseMatrix _d_matrix; + // device copy of the solution + DeviceArray _d_solution; + // cuDSS handle + cudssHandle_t _cudss_handle; + }; +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/api.h b/lib/mito/solvers/backend/cuda/api.h new file mode 100644 index 000000000..fbb598816 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/api.h @@ -0,0 +1,22 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // cuda dense solver + template + using dense_t = CUDADenseSolver; + + // cuda sparse solver + template + using sparse_t = CUDASparseSolver; +} + + +// end of file \ No newline at end of file diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h new file mode 100644 index 000000000..a6422a3bd --- /dev/null +++ b/lib/mito/solvers/backend/cuda/externals.h @@ -0,0 +1,42 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +// externals +#include +#include +#include +#include +#include +#ifdef WITH_EIGEN +#include // for sparse matrix support +#endif // WITH_EIGEN + + +// cuda support +#include 
+#include +#ifdef WITH_EIGEN +#include +#endif // WITH_EIGEN + + +// utilities +#include "utilities.h" + + +namespace mito::solvers::cuda { + + // concept for a real type + template + concept real_c = std::is_floating_point_v; + +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h new file mode 100644 index 000000000..8dd3f44a6 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/factories.h @@ -0,0 +1,28 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // cuda dense solver + template + auto dense(SolverType solver_type = SolverType::LU) + { + return dense_t(solver_type); + } + + // cuda sparse solver + template + auto sparse(SolverType solver_type = SolverType::LU) + { + return sparse_t(solver_type); + } +} + + +// end of file \ No newline at end of file diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h new file mode 100644 index 000000000..ec32486d8 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/forward.h @@ -0,0 +1,28 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // available insert modes for matrix and right-hand side entries + enum InsertMode { ADD_VALUE, INSERT_VALUE }; + + // available solver types + enum SolverType { CHOLESKY, LU }; + + // class for CUDA dense solver + template + class CUDADenseSolver; + + // class for CUDA sparse solver + template + class CUDASparseSolver; +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h new file mode 100644 index 000000000..cd87df6b1 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/public.h @@ -0,0 +1,33 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +// external packages +#include "externals.h" + +// get the forward declarations +#include "forward.h" + +// published types +#include "api.h" + +// classes +#include "CUDASolver.h" +#include "CUDADenseSolver.h" +#ifdef WITH_EIGEN +#include "CUDASparseSolver.h" +#endif // WITH_EIGEN + +// error checks +#include "CUDAErrorChecks.h" + +// factories implementation +#include "factories.h" + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/utilities.h b/lib/mito/solvers/backend/cuda/utilities.h new file mode 100644 index 000000000..c84a8af23 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/utilities.h @@ -0,0 +1,508 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // a utility class to manage host memory + template + class HostArray { + public: + // type alias for real numbers + using real_type = T; + // available host memory types + enum HostMemoryType { PAGEABLE, PINNED }; + + public: + // default constructor + HostArray() : _data(nullptr), _size(0), _memory_type(HostMemoryType::PINNED) {} + + // parameterized constructor + HostArray(std::size_t size, HostMemoryType memory_type = HostMemoryType::PINNED) : + _data(nullptr), + _size(size), + _memory_type(memory_type) + { + allocate(size); + } + + // allocate on construction + explicit HostArray(std::size_t size) : HostArray() { allocate(size); } + + // destructor + ~HostArray() { _free_memory(); } + + // delete copy constructor + 
HostArray(const HostArray &) = delete; + + // delete copy assignment operator + HostArray & operator=(const HostArray &) = delete; + + // enable move semantics + HostArray(HostArray && other) noexcept : + _data(other._data), + _size(other._size), + _memory_type(other._memory_type) + { + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + other._memory_type = HostMemoryType::PINNED; + } + + HostArray & operator=(HostArray && other) noexcept + { + if (this != &other) { + // free existing memory and acquire new memory + _free_memory(); + _data = other._data; + _size = other._size; + _memory_type = other._memory_type; + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + other._memory_type = HostMemoryType::PINNED; + } + return *this; + } + + // allocate or reallocate memory for the array + void allocate(std::size_t size) + { + if (_size == size) + return; + + // free old memory if any + // NOTE: This could be expensive if called frequently. We are going ahead with this + // for now for simplicity. But, as a first step towards improvement, we can come up with + // a way to reuse existing memory if the new allocation size is smaller than the current + // size. + _free_memory(); + + _size = size; + if (_size == 0) + return; + + // try to allocate pinned memory first for faster transfers + cudaError_t pinned_alloc_error = + cudaMallocHost((void **) &_data, _size * sizeof(real_type)); + if (pinned_alloc_error == cudaSuccess) { + _memory_type = HostMemoryType::PINNED; + } else { + // fallback to pageable memory + try { + std::cerr + << "[HostArray] Warning: Failed to allocate pinned memory. Falling back " + "to pageable memory\n"; + _data = new real_type[_size]; + _memory_type = HostMemoryType::PAGEABLE; + } catch (const std::bad_alloc & e) { + throw std::runtime_error( + "Failed to allocate host memory: " + std::string(e.what())); + } + } + + // all done + return; + } + + // zero out the array + void zero() + { + if (_data) { + std::fill(_data, _data + _size, static_cast(0.0)); + } + } + + // data accessor + real_type * data() { return _data; } + + // const data accessor + const real_type * data() const { return _data; } + + // size accessor + std::size_t size() const { return _size; } + + // accessor for memory type + HostMemoryType memory_type() const { return _memory_type; } + + // element access + real_type & operator[](std::size_t idx) { return _data[idx]; } + + // const element access + const real_type & operator[](std::size_t idx) const { return _data[idx]; } + + private: + // free the memory in the array + void _free_memory() + { + if (!_data) + return; + + // free the memory + if (_memory_type == HostMemoryType::PINNED) { + cudaError_t error = cudaFreeHost(_data); + if (error != cudaSuccess) { + std::cerr << "[HostArray] Error: Failed to free pinned memory: " + << cudaGetErrorString(error) << "\n"; + } + } else { + delete[] _data; + } + + // reset the state + _data = nullptr; + _size = 0; + _memory_type = HostMemoryType::PINNED; + + // all done + return; + } + + private: + // pointer to the data + real_type * _data; + // size of the array + std::size_t _size; + // type of memory + HostMemoryType _memory_type; + }; + + // a utility class to manage device memory + template + class DeviceArray { + public: + using real_type = T; + + public: + // default constructor + DeviceArray() : _data(nullptr), _size(0) {} + + // allocate on construction + explicit DeviceArray(std::size_t size) : DeviceArray() { allocate(size); } 
+ + // destructor + ~DeviceArray() { _free_memory(); } + + // delete copy constructor + DeviceArray(const DeviceArray &) = delete; + + // delete copy assignment operator + DeviceArray & operator=(const DeviceArray &) = delete; + + // enable move semantics + DeviceArray(DeviceArray && other) noexcept : _data(other._data), _size(other._size) + { + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + } + + DeviceArray & operator=(DeviceArray && other) noexcept + { + if (this != &other) { + // free existing memory and acquire new memory + _free_memory(); + _data = other._data; + _size = other._size; + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + } + return *this; + } + + // allocate or reallocate memory for the array + void allocate(std::size_t size) + { + if (_size == size) + return; + + // free old memory if any + _free_memory(); + + _size = size; + if (_size == 0) + return; + + // allocate device memory + cudaError_t error = cudaMalloc((void **) &_data, size * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceArray] Error: Failed to allocate device memory: " + + std::string(cudaGetErrorString(error))); + } + + // all done + return; + } + + // zero out the device array + void zero() + { + if (_data) { + cudaError_t error = cudaMemset(_data, 0, _size * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceArray] Error: Failed to zero out device memory: " + + std::string(cudaGetErrorString(error))); + } + } + } + + // data accessor + real_type * data() { return _data; } + + // const data accessor + const real_type * data() const { return _data; } + + // size accessor + std::size_t size() const { return _size; } + + private: + // free the memory in the array + void _free_memory() + { + if (!_data) + return; + + // free the memory + cudaError_t error = cudaFree(_data); + if (error != cudaSuccess) { + std::cerr << "[DeviceArray] Error: Failed to free device memory: " + << cudaGetErrorString(error) << "\n"; + } + + // reset the state + _data = nullptr; + _size = 0; + + // all done + return; + } + + private: + // pointer to the data + real_type * _data; + // size of the array + std::size_t _size; + }; + + // a utility class to manage device memory for sparse matrices in CSR format + template + class DeviceSparseMatrix { + public: + using real_type = T; + using index_type = int; + + public: + // default constructor + DeviceSparseMatrix() : + _values(nullptr), + _row_offsets(nullptr), + _col_indices(nullptr), + _rows(0), + _cols(0), + _nnz(0) + {} + + // allocate on construction + explicit DeviceSparseMatrix(std::size_t rows, std::size_t cols, std::size_t nnz) : + DeviceSparseMatrix() + { + allocate(rows, cols, nnz); + } + + // destructor + ~DeviceSparseMatrix() { _free_memory(); } + + // delete copy constructor + DeviceSparseMatrix(const DeviceSparseMatrix &) = delete; + + // delete copy assignment operator + DeviceSparseMatrix & operator=(const DeviceSparseMatrix &) = delete; + + // enable move semantics + DeviceSparseMatrix(DeviceSparseMatrix && other) noexcept : + _values(other._values), + _row_offsets(other._row_offsets), + _col_indices(other._col_indices), + _rows(other._rows), + _cols(other._cols), + _nnz(other._nnz) + { + // reset the state of the moved-from object + other._values = nullptr; + other._row_offsets = nullptr; + other._col_indices = nullptr; + other._rows = 0; + other._cols = 0; + other._nnz = 0; + } + + DeviceSparseMatrix & 
operator=(DeviceSparseMatrix && other) noexcept + { + if (this != &other) { + // free existing memory and acquire new memory + _free_memory(); + _values = other._values; + _row_offsets = other._row_offsets; + _col_indices = other._col_indices; + _rows = other._rows; + _cols = other._cols; + _nnz = other._nnz; + // reset the state of the moved-from object + other._values = nullptr; + other._row_offsets = nullptr; + other._col_indices = nullptr; + other._rows = 0; + other._cols = 0; + other._nnz = 0; + } + return *this; + } + + // allocate or reallocate memory for the matrix + void allocate(std::size_t rows, std::size_t cols, std::size_t nnz) + { + if (_rows == rows && _cols == cols && _nnz == nnz) + return; + + // free old memory if any + _free_memory(); + + _rows = rows; + _cols = cols; + _nnz = nnz; + if (_rows == 0 || _cols == 0 || _nnz == 0) // no memory to allocate + return; + + // allocate device memory for the CSR representation + cudaError_t error; + + error = cudaMalloc((void **) &_values, _nnz * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to allocate values array: " + + std::string(cudaGetErrorString(error))); + } + + error = cudaMalloc((void **) &_row_offsets, (_rows + 1) * sizeof(index_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to allocate row offsets array: " + + std::string(cudaGetErrorString(error))); + } + + error = cudaMalloc((void **) &_col_indices, _nnz * sizeof(index_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to allocate column indices array: " + + std::string(cudaGetErrorString(error))); + } + + // all done + return; + } + + // zero out the device matrix + void zero() + { + if (_values) { + cudaError_t error = cudaMemset(_values, 0, _nnz * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to zero out device memory: " + + std::string(cudaGetErrorString(error))); + } + } + } + + // values accessor + real_type * values() { return _values; } + + // row offsets accessor + index_type * row_offsets() { return _row_offsets; } + + // column indices accessor + index_type * col_indices() { return _col_indices; } + + // const values accessor + const real_type * values() const { return _values; } + + // const row offsets accessor + const index_type * row_offsets() const { return _row_offsets; } + + // const column indices accessor + const index_type * col_indices() const { return _col_indices; } + + // row number accessor + size_t rows() const { return _rows; } + + // column number accessor + size_t cols() const { return _cols; } + + // non-zero entries accessor + size_t nnz() const { return _nnz; } + + private: + // free the memory in the matrix + void _free_memory() + { + if (_values) { + cudaError_t error = cudaFree(_values); + if (error != cudaSuccess) { + std::cerr << "[DeviceSparseMatrix] Error: Failed to free values array: " + << cudaGetErrorString(error) << "\n"; + } + } + if (_row_offsets) { + cudaError_t error = cudaFree(_row_offsets); + if (error != cudaSuccess) { + std::cerr << "[DeviceSparseMatrix] Error: Failed to free row offsets array: " + << cudaGetErrorString(error) << "\n"; + } + } + if (_col_indices) { + cudaError_t error = cudaFree(_col_indices); + if (error != cudaSuccess) { + std::cerr << "[DeviceSparseMatrix] Error: Failed to free column indices array: " + << cudaGetErrorString(error) << "\n"; + } + } + 
+ // reset the state + _values = nullptr; + _row_offsets = nullptr; + _col_indices = nullptr; + _rows = 0; + _cols = 0; + _nnz = 0; + + // all done + return; + } + + private: + // pointer to the non-zero values + real_type * _values; + // pointer to the row offsets + index_type * _row_offsets; + // pointer to the column indices + index_type * _col_indices; + // number of rows + size_t _rows; + // number of columns + size_t _cols; + // number of non-zero entries + size_t _nnz; + }; +} + + +// end of file diff --git a/lib/mito/solvers/public.h b/lib/mito/solvers/public.h index aca9e6a46..698d69016 100644 --- a/lib/mito/solvers/public.h +++ b/lib/mito/solvers/public.h @@ -12,4 +12,9 @@ #endif // WITH_PETSC +#ifdef WITH_CUDA +#include "backend/cuda/public.h" +#endif // WITH_CUDA + + // end of file diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc new file mode 100644 index 000000000..6cd7ea189 --- /dev/null +++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc @@ -0,0 +1,145 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include +#include + + +TEST(Solvers, CUDADenseSolverSymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Dense solver for a linear system of size {N} + auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::CHOLESKY); + solver.initialize(N); + + // set matrix entries + double A[10][10] = { { 3.5, -1.2, 0.8, 0.0, -0.5, 1.1, 0.0, 0.3, -0.9, 0.4 }, + { -1.2, 4.0, 1.3, -0.4, 0.6, -0.7, 0.0, 0.5, 0.0, -1.0 }, + { 0.8, 1.3, 3.8, 0.7, -0.8, 0.0, -0.2, 0.6, -0.5, 0.0 }, + { 0.0, -0.4, 0.7, 4.2, 1.0, 0.9, 0.3, 0.0, 0.4, -0.3 }, + { -0.5, 0.6, -0.8, 1.0, 4.5, 0.4, -0.7, 0.8, 0.0, 0.2 }, + { 1.1, -0.7, 0.0, 0.9, 0.4, 3.6, -0.3, 0.0, -0.4, 0.5 }, + { 0.0, 0.0, -0.2, 0.3, -0.7, -0.3, 3.3, 0.6, -0.8, 0.0 }, + { 0.3, 0.5, 0.6, 0.0, 0.8, 0.0, 0.6, 4.1, 0.2, 0.7 }, + { -0.9, 0.0, -0.5, 0.4, 0.0, -0.4, -0.8, 0.2, 3.7, 0.9 }, + { 0.4, -1.0, 0.0, -0.3, 0.2, 0.5, 0.0, 0.7, 0.9, 4.0 } }; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + solver.set_matrix_value(i, j, A[i][j], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + } + + // set rhs entries + double b[10] = { 1.0, -2.0, 0.5, 1.5, -1.0, 0.7, -0.8, 2.0, 0.0, -1.5 }; + for (int i = 0; i < N; i++) { + solver.set_rhs_value(i, b[i], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], -0.0560503759, tol); + EXPECT_NEAR(x[1], -0.6932480517, tol); + EXPECT_NEAR(x[2], 0.0666007599, tol); + EXPECT_NEAR(x[3], 0.3544482567, tol); + EXPECT_NEAR(x[4], -0.4084514057, tol); + EXPECT_NEAR(x[5], 0.0767339355, tol); + EXPECT_NEAR(x[6], -0.5097195863, tol); + EXPECT_NEAR(x[7], 0.8324810769, tol); + EXPECT_NEAR(x[8], -0.0333448254, tol); + EXPECT_NEAR(x[9], -0.6434741305, tol); + + // all done + return; +} + +TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Dense solver for a linear system of size {N} + auto solver = 
mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::LU); + solver.initialize(N); + + // set matrix entries + double A[10][10] = { { 3.5, -1.2, 0.8, 0.0, -0.5, 1.1, 0.0, 0.3, -0.9, 0.4 }, + { 0.6, 4.0, 1.3, -0.4, 0.6, -0.7, 0.0, 0.5, 0.0, -1.0 }, + { -1.1, 1.0, 3.8, 0.7, -0.8, 0.0, -0.2, 0.6, -0.5, 0.0 }, + { 0.0, -0.4, 0.7, 4.2, -0.9, 0.9, 0.3, 0.0, 0.4, -0.3 }, + { 0.9, 0.0, -0.8, 1.0, 4.5, 0.4, -0.7, 0.8, 0.0, 0.2 }, + { 1.1, -0.7, 0.0, -1.3, 0.4, 3.6, -0.3, 0.0, -0.4, 0.5 }, + { 0.0, 0.0, 1.1, 0.3, -0.7, -0.3, 3.3, 0.6, -0.8, 0.0 }, + { 0.3, 0.5, 0.6, 0.0, 0.8, 0.0, 0.6, 4.1, -0.5, 0.7 }, + { -0.9, 0.0, -0.5, -1.0, 0.0, -0.4, -0.8, 0.2, 3.7, 0.9 }, + { 0.4, 1.3, 0.0, -0.3, 0.2, 0.5, 0.0, 0.7, 0.9, 4.0 } }; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + solver.set_matrix_value(i, j, A[i][j], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + } + + // set rhs entries + double b[10] = { -1.2, 0.5, 0.7, 1.0, 0.3, -1.5, 2.0, -0.8, 0.6, 1.1 }; + for (int i = 0; i < N; i++) { + solver.set_rhs_value(i, b[i], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], -0.1642774470, tol); + EXPECT_NEAR(x[1], 0.1352982046, tol); + EXPECT_NEAR(x[2], 0.2493049978, tol); + EXPECT_NEAR(x[3], 0.2587528172, tol); + EXPECT_NEAR(x[4], 0.2711567201, tol); + EXPECT_NEAR(x[5], -0.2258981243, tol); + EXPECT_NEAR(x[6], 0.6823460181, tol); + EXPECT_NEAR(x[7], -0.4005242029, tol); + EXPECT_NEAR(x[8], 0.3015605175, tol); + EXPECT_NEAR(x[9], 0.2837823381, tol); + + // all done + return; +} + + +int +main(int argc, char ** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto result = RUN_ALL_TESTS(); + + // all done + return result; +} + + +// end of file diff --git a/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc new file mode 100644 index 000000000..e29a8a77e --- /dev/null +++ b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc @@ -0,0 +1,131 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include +#include + +TEST(Solvers, CUDASparseSolverSymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Sparse solver for a linear system of size {N} + auto solver = + mito::solvers::cuda::sparse(mito::solvers::cuda::SolverType::CHOLESKY); + solver.initialize(N, 3); // 3 non-zero entries per row + + // set matrix and right-hand side entries + for (int i = 0; i < N; i++) { + // matrix: diagonal + solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE); + // matrix: subdiagonal + if (i > 0) { + solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + } + // matrix: superdiagonal + if (i < N - 1) { + solver.set_matrix_value(i, i + 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + // rhs + solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], 5.0, tol); 
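+    // NOTE: the expected entries follow the closed form x_k = k * (N + 1 - k) / 2 (k = 1, ..., N, 1-based) for this tridiagonal system with 2 on the diagonal, -1 on the off-diagonals, and a unit right-hand side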
+ EXPECT_NEAR(x[1], 9.0, tol); + EXPECT_NEAR(x[2], 12.0, tol); + EXPECT_NEAR(x[3], 14.0, tol); + EXPECT_NEAR(x[4], 15.0, tol); + EXPECT_NEAR(x[5], 15.0, tol); + EXPECT_NEAR(x[6], 14.0, tol); + EXPECT_NEAR(x[7], 12.0, tol); + EXPECT_NEAR(x[8], 9.0, tol); + EXPECT_NEAR(x[9], 5.0, tol); + + // all done + return; +} + +TEST(Solvers, CUDASparseSolverUnsymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Sparse solver for a linear system of size {N} + auto solver = mito::solvers::cuda::sparse(mito::solvers::cuda::SolverType::LU); + solver.initialize(N); // non-zeros per row are not set + + // set matrix and right-hand side entries + for (int i = 0; i < N; i++) { + // matrix: diagonal + solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + // matrix: subdiagonal + if (i > 0) { + solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + // matrix: superdiagonal + if (i < N - 1) { + solver.set_matrix_value(i, i + 1, -0.5, mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + // rhs + solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], 0.8284194482079074, tol); + EXPECT_NEAR(x[1], 1.3136777928316294, tol); + EXPECT_NEAR(x[2], 1.5978722749107030, tol); + EXPECT_NEAR(x[3], 1.7641335139795535, tol); + EXPECT_NEAR(x[4], 1.8607895060968098, tol); + EXPECT_NEAR(x[5], 1.9148909964281315, tol); + EXPECT_NEAR(x[6], 1.9379849735189063, tol); + EXPECT_NEAR(x[7], 1.9221579012193621, tol); + EXPECT_NEAR(x[8], 1.8126616578396353, tol); + EXPECT_NEAR(x[9], 1.4063308289198175, tol); + + // all done + return; +} + + +int +main(int argc, char ** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto result = RUN_ALL_TESTS(); + + // all done + return result; +} + + +// end of file
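Usage sketch (not part of the patch): the snippet below only summarizes the new solver API end to end, mirroring the dense-solver tests above; the mito/solvers.h include path and the explicit double template argument are assumptions inferred from the headers in this diff rather than verbatim project conventions.

// a minimal sketch: assemble and solve a 2x2 SPD system with the CUDA dense Cholesky solver
#include <mito/solvers.h>    // assumed public header exposing mito::solvers::cuda
#include <vector>

int main()
{
    constexpr int N = 2;

    // create a dense Cholesky solver and size the linear system
    auto solver = mito::solvers::cuda::dense<double>(mito::solvers::cuda::SolverType::CHOLESKY);
    solver.initialize(N);

    // assemble A = [[4, 1], [1, 3]] and b = [1, 2] on the host (INSERT_VALUE is the default mode)
    solver.set_matrix_value(0, 0, 4.0);
    solver.set_matrix_value(0, 1, 1.0);
    solver.set_matrix_value(1, 0, 1.0);
    solver.set_matrix_value(1, 1, 3.0);
    solver.set_rhs_value(0, 1.0);
    solver.set_rhs_value(1, 2.0);

    // lock the assembly, solve on the device, and copy the solution back to the host
    solver.finalize_assembly();
    solver.solve();

    std::vector<double> x(N);
    solver.get_solution(x);    // expected solution: x = { 1/11, 7/11 }

    return 0;
}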