Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
39b609e
solvers/cuda/CUDADenseSolver: first attempt at the CUDA dense solver …
saikubairkota Jun 6, 2025
cc8459e
solvers/cuda/CUDADenseSolver: removed negative index comparison while…
saikubairkota Jun 9, 2025
d3fcb4c
solvers/cuda/CUDADenseSolver: hardcoded mito::real to double to avoid…
saikubairkota Jun 9, 2025
8f722b5
.cmake: added support for CUDA compiler and CUDA solver
saikubairkota Jun 9, 2025
1a9c41d
solvers/cuda/CUDADenseSolver: enabled apis & factories to create CUDA…
saikubairkota Jun 14, 2025
2a2b8d4
solvers/cuda/CUDADenseSolver: added function to get solution from the…
saikubairkota Jun 14, 2025
bfe62f6
tests/mito.lib/solvers/cuda_dense: added a simple linear system solve…
saikubairkota Jun 14, 2025
27759da
solvers/cuda/CUDADenseSolver: replaced double with mito::real for gen…
saikubairkota Jun 17, 2025
b3541e4
solvers/cuda/CUDADenseSolver: added cusolver_traits to support differ…
saikubairkota Jun 17, 2025
af6988f
solvers/cuda/CUDADenseSolver: introduced SolverType enum & new parame…
saikubairkota Jun 17, 2025
c88a099
solvers/cuda/CUDADenseSolver: storing matrix in column-major order on…
saikubairkota Jun 17, 2025
5dea7bc
solvers/cuda/CUDADenseSolver: added support for Cholesky solver
saikubairkota Jun 17, 2025
9f38bec
tests/mito.lib/solvers/cuda_dense: switched to Cholesky solver for so…
saikubairkota Jun 17, 2025
1e8a49a
tests/mito.lib/solvers/cuda_dense: introduced checks with tolerances
saikubairkota Jun 17, 2025
10a4314
tests/mito.lib/solvers/cuda_dense: added test with an unsymmetric matrix
saikubairkota Jun 17, 2025
36160df
solvers/cuda: {typedef} real type in {CUDADenseSolver} class
bgiovanardi Jun 19, 2025
2f5557e
solvers/cuda: remove dependency of class {CUDADenseSolver} from other…
bgiovanardi Jun 19, 2025
81568b9
solvers/cuda: class {CUDADenseSolver} is now template with respect to…
bgiovanardi Jun 19, 2025
9a59e4a
solvers/cuda: add concept for a type representing a real value
bgiovanardi Jun 20, 2025
4c7a268
solvers/cuda: remove redundant namespace specification
bgiovanardi Jun 20, 2025
11123cb
solvers/cuda/CUDADenseSolver: added arguments for constructor & metho…
saikubairkota Jul 5, 2025
88e15e8
solvers/cuda/CUDADenseSolver: added separate header for CUDA error ch…
saikubairkota Aug 10, 2025
0c947d4
solvers/cuda/CUDASolver: added base class for implementing attributes…
saikubairkota Aug 10, 2025
665a86c
solvers/cuda/CUDADenseSolver: made derived class of CUDASolver
saikubairkota Aug 10, 2025
a8d7969
solvers/cuda/utilities: added HostArray utility class to manage host …
saikubairkota Sep 1, 2025
3b5e210
solvers/cuda/utilities: added DeviceArray utility class to manage dev…
saikubairkota Sep 7, 2025
76ee128
solvers/cuda/CUDASolver & CUDADenseSolver: multiple changes with use …
saikubairkota Sep 7, 2025
f7d913b
solvers/cuda: removed/corrected some comments
saikubairkota Sep 17, 2025
3ee0e23
solvers/cuda/utilities: added DeviceSparseMatrix class to manage devi…
saikubairkota Sep 17, 2025
ddefaab
solvers/cuda/CUDASparseSolver: first attempt at the CUDA sparse solve…
saikubairkota Sep 18, 2025
7ecb1ce
tests/mito.lib/solvers/cuda_sparse: added symmetric & unsymmetric sys…
saikubairkota Sep 18, 2025
356a00b
solvers/cuda/CUDASparseSolver: corrected nnz default input as size_t …
saikubairkota Sep 18, 2025
a7e15e6
solvers/cuda: added synchronization steps after memory transfers & so…
saikubairkota Sep 18, 2025
0bf8c74
tests/mito.lib/solvers/cuda_sparse: not setting nnz per row in unsymm…
saikubairkota Sep 18, 2025
d0e52bc
tests/mito.lib/solvers/cuda_dense: made matrices truly dense by filli…
saikubairkota Sep 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .cmake/mito-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,10 @@ endif(@WITH_VTK@)
if(@WITH_PETSC@)
# add compiler definitions
add_definitions(-DWITH_PETSC)
endif(@WITH_PETSC@)
endif(@WITH_PETSC@)

# cuda dependency
if(@WITH_CUDA@)
# add compiler definitions
add_definitions(-DWITH_CUDA)
endif(@WITH_CUDA@)
54 changes: 54 additions & 0 deletions .cmake/mito_cuda.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# -*- cmake -*-
#
# Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
#


# CUDA support
option(WITH_CUDA "Enable support for CUDA" OFF)

# if CUDA is requested
if(WITH_CUDA)
# find CUDA package
find_package(CUDAToolkit REQUIRED)
# report
message(STATUS "Enable CUDA support")
# add compiler definitions
add_definitions(-DWITH_CUDA)
message(STATUS "Add definition WITH_CUDA")
# enable CUDA language for targets
enable_language(CUDA)
# include CUDA headers
target_include_directories(mito PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
# link against CUDA libraries
target_link_libraries(mito PUBLIC CUDA::cudart CUDA::cusolver)
# set CUDA host compiler to match the C++ compiler (to tell NVCC the compiler to use for the host code)
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE FILEPATH "Host compiler for NVCC")
endif()
message(STATUS "Using CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
# disable offline compilation support warnings
target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>)
# find Eigen for CUDA sparse solver
find_package(Eigen3 3.3 QUIET NO_MODULE)
if(Eigen3_FOUND)
# add Eigen compiler definition
message(STATUS "Found Eigen3: ${EIGEN3_INCLUDE_DIR}")
add_definitions(-DWITH_EIGEN)
# add include directories for Eigen
target_include_directories(mito PUBLIC ${EIGEN3_INCLUDE_DIR})
# disable warnings due to incompatible return types in Eigen
target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
# find cudss library
find_package(cudss REQUIRED)
# include cudss headers
target_include_directories(mito PUBLIC ${cudss_INCLUDE_DIR})
# link against cudss libraries
target_link_libraries(mito PUBLIC cudss)
else()
message(WARNING "Eigen3 not found. CUDA sparse solver parts will be disabled.")
endif()
endif()


# end of file
9 changes: 9 additions & 0 deletions .cmake/mito_sources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,14 @@ set(MITO_SOURCES ${MITO_SOURCES}
)
endif()

# the mito cuda solvers backend
if (WITH_CUDA)
set(MITO_SOURCES ${MITO_SOURCES}
lib/mito/solvers/backend/cuda/CUDASolver.cu
lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
)
endif()


# end of file
5 changes: 5 additions & 0 deletions .cmake/mito_tests_mito_lib.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,11 @@ if(WITH_PETSC)
mito_test_driver(tests/mito.lib/solvers/petsc_solve_linear_system.cc)
endif()

if(WITH_CUDA)
mito_test_driver(tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc)
mito_test_driver(tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc)
endif()

# tensor
mito_test_driver(tests/mito.lib/tensor/one_forms.cc)
mito_test_driver(tests/mito.lib/tensor/contractions.cc)
Expand Down
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ include(mito_metis)
# petsc support
include(mito_petsc)

# cuda support
include(mito_cuda)

# configure mito as a cmake package (so that mito can be found by cmake with find_package(mito))
mito_shareCmakePackage()

Expand Down
243 changes: 243 additions & 0 deletions lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
// -*- c++ -*-
//
// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
//


#include "public.h"


// constructor
template <mito::solvers::cuda::real_c realT>
mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_type) :
CUDASolver<realT>(solver_type),
_h_matrix(),
_d_matrix(),
_cusolver_handle()
{
// initialize cuSOLVER
_initialize_cusolver();
}

// destructor
template <mito::solvers::cuda::real_c realT>
mito::solvers::cuda::CUDADenseSolver<realT>::~CUDADenseSolver()
{
// finalize cuSOLVER
_finalize_cusolver();
}

// add/insert {value} to matrix entry at ({row}, {col}) of the host copy
template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void
{
// check if the system assembly is finalized and throw an error if it is
if (this->_is_assembly_finalized) {
throw std::logic_error(
"System assembly is already finalized. Cannot add/insert values to the matrix.");
}

// check if the row and column indices are within bounds
this->_check_index_validity(row);
this->_check_index_validity(col);

// add/insert the value to the matrix entry in the host matrix
// NOTE: We store the matrix in column-major order since the cuSOLVER library expects the matrix
// to be in column-major order.
if (insert_mode == InsertMode::ADD_VALUE)
_h_matrix[col * this->_size + row] += value;
else if (insert_mode == InsertMode::INSERT_VALUE)
_h_matrix[col * this->_size + row] = value;
else
throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");

// all done
return;
}

template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
{
// check if the solver is initialized
if (!this->_is_solver_initialized) {
throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
}

// fill the host matrix, rhs and solution with zeros
_h_matrix.zero();
this->_h_rhs.zero();
this->_h_solution.zero();

// reset the assembly finalized flag
this->_is_assembly_finalized = false;

// all done
return;
}

template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
{
// check if the assembly is finalized
if (!this->_is_assembly_finalized) {
throw std::logic_error(
"System assembly is not yet finalized. Call finalize_assembly() first.");
}

// copy the host matrix and rhs data to device global memory
CHECK_CUDA_ERROR(cudaMemcpy(
_d_matrix.data(), _h_matrix.data(), this->_size * this->_size * sizeof(real_type),
cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy(
this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type),
cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

// allocate device memory for temporary variables in the factorization
int * d_pivot = nullptr;
int * d_info = nullptr;
real_type * d_workspace = nullptr;
int workspace_size = 0;

// check the solver type is either LU or Cholesky
if (this->_solver_type != SolverType::LU && this->_solver_type != SolverType::CHOLESKY) {
throw std::invalid_argument(
"Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense "
"solver.");
}

// get the workspace size for the factorization
if (this->_solver_type == SolverType::LU) {
CHECK_CUSOLVER_ERROR(
cusolver_traits<real_type>::getrf_buffer_size(
_cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size,
&workspace_size));
} else if (this->_solver_type == SolverType::CHOLESKY) {
CHECK_CUSOLVER_ERROR(
cusolver_traits<real_type>::potrf_buffer_size(
_cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(),
this->_size, &workspace_size));
}

CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, this->_size * sizeof(int)));
CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(real_type)));

// perform the factorization
if (this->_solver_type == SolverType::LU) {
CHECK_CUSOLVER_ERROR(
cusolver_traits<real_type>::getrf(
_cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size,
d_workspace, d_pivot, d_info));
} else if (this->_solver_type == SolverType::CHOLESKY) {
CHECK_CUSOLVER_ERROR(
cusolver_traits<real_type>::potrf(
_cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(),
this->_size, d_workspace, workspace_size, d_info));
}
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

// solve the linear system
if (this->_solver_type == SolverType::LU) {
CHECK_CUSOLVER_ERROR(
cusolver_traits<real_type>::getrs(
_cusolver_handle, CUBLAS_OP_N, this->_size, 1, _d_matrix.data(), this->_size,
d_pivot, this->_d_rhs.data(), this->_size, d_info));
} else if (this->_solver_type == SolverType::CHOLESKY) {
CHECK_CUSOLVER_ERROR(
cusolver_traits<real_type>::potrs(
_cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, 1, _d_matrix.data(),
this->_size, this->_d_rhs.data(), this->_size, d_info));
}
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

// copy the solution from device global memory to host memory
// NOTE: _d_rhs contains the solution after the call to getrs/potrs as its contents are
// overwritten by the solution vector
CHECK_CUDA_ERROR(cudaMemcpy(
this->_h_solution.data(), this->_d_rhs.data(), this->_size * sizeof(real_type),
cudaMemcpyDeviceToHost));

// free the temporary device memory
CHECK_CUDA_ERROR(cudaFree(d_pivot));
CHECK_CUDA_ERROR(cudaFree(d_info));
CHECK_CUDA_ERROR(cudaFree(d_workspace));

// all done
return;
}

template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
{
// create the cuSOLVER handle
CHECK_CUSOLVER_ERROR(cusolverDnCreate(&_cusolver_handle));

// create a cuda stream
CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0));

// set the stream for the cuSOLVER handle
CHECK_CUSOLVER_ERROR(cusolverDnSetStream(_cusolver_handle, this->_cuda_stream));

// all done
return;
}

template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
{
// destroy the cuSOLVER handle
CHECK_CUSOLVER_ERROR(cusolverDnDestroy(_cusolver_handle));

// destroy the cuda stream
CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream));

// reset the handle and stream pointers
_cusolver_handle = nullptr;
this->_cuda_stream = nullptr;

// all done
return;
}

template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size) -> void
{
// allocate host memory for matrix, rhs and solution
_h_matrix.allocate(size * size);
this->_h_rhs.allocate(size);
this->_h_solution.allocate(size);

// zero out the arrays
_h_matrix.zero();
this->_h_rhs.zero();
this->_h_solution.zero();

// all done
return;
}

template <mito::solvers::cuda::real_c realT>
auto
mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size) -> void
{
// allocate global device memory for matrix and rhs
_d_matrix.allocate(size * size);
this->_d_rhs.allocate(size);

// all done
return;
}

// explicit instantiation of the {CUDADenseSolver} class for doubles
template class mito::solvers::cuda::CUDADenseSolver<double>;

// explicit instantiation of the {CUDADenseSolver} class for floats
template class mito::solvers::cuda::CUDADenseSolver<float>;
Loading
Loading