diff --git a/.cmake/mito-config.cmake.in b/.cmake/mito-config.cmake.in
index 124606f58..d792ed33f 100644
--- a/.cmake/mito-config.cmake.in
+++ b/.cmake/mito-config.cmake.in
@@ -44,4 +44,10 @@ endif(@WITH_VTK@)
 if(@WITH_PETSC@)
     # add compiler definitions
     add_definitions(-DWITH_PETSC)
-endif(@WITH_PETSC@)
\ No newline at end of file
+endif(@WITH_PETSC@)
+
+# cuda dependency
+if(@WITH_CUDA@)
+    # add compiler definitions
+    add_definitions(-DWITH_CUDA)
+endif(@WITH_CUDA@)
\ No newline at end of file
diff --git a/.cmake/mito_cuda.cmake b/.cmake/mito_cuda.cmake
new file mode 100644
index 000000000..d1f0cc7b1
--- /dev/null
+++ b/.cmake/mito_cuda.cmake
@@ -0,0 +1,54 @@
+# -*- cmake -*-
+#
+# Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+#
+
+
+# CUDA support
+option(WITH_CUDA "Enable support for CUDA" OFF)
+
+# if CUDA is requested
+if(WITH_CUDA)
+    # find CUDA package
+    find_package(CUDAToolkit REQUIRED)
+    # report
+    message(STATUS "Enable CUDA support")
+    # add compiler definitions
+    add_definitions(-DWITH_CUDA)
+    message(STATUS "Add definition WITH_CUDA")
+    # enable CUDA language for targets
+    enable_language(CUDA)
+    # include CUDA headers
+    target_include_directories(mito PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+    # link against CUDA libraries
+    target_link_libraries(mito PUBLIC CUDA::cudart CUDA::cusolver)
+    # set CUDA host compiler to match the C++ compiler (to tell NVCC the compiler to use for the host code)
+    if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+        set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE FILEPATH "Host compiler for NVCC")
+    endif()
+    message(STATUS "Using CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
+    # disable offline compilation support warnings
+    target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>)
+    # find Eigen for CUDA sparse solver
+    find_package(Eigen3 3.3 QUIET NO_MODULE)
+    if(Eigen3_FOUND)
+        # add Eigen compiler definition
+        message(STATUS "Found Eigen3: ${EIGEN3_INCLUDE_DIR}")
+        add_definitions(-DWITH_EIGEN)
+        # add include directories for Eigen
+        target_include_directories(mito PUBLIC ${EIGEN3_INCLUDE_DIR})
+        # disable warnings due to incompatible return types in Eigen
+        target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
+        # find cudss library
+        find_package(cudss REQUIRED)
+        # include cudss headers
+        target_include_directories(mito PUBLIC ${cudss_INCLUDE_DIR})
+        # link against cudss libraries
+        target_link_libraries(mito PUBLIC cudss)
+    else()
+        message(WARNING "Eigen3 not found. 
CUDA sparse solver parts will be disabled.") + endif() +endif() + + +# end of file diff --git a/.cmake/mito_sources.cmake b/.cmake/mito_sources.cmake index 3095b5fea..86c600cd2 100644 --- a/.cmake/mito_sources.cmake +++ b/.cmake/mito_sources.cmake @@ -14,5 +14,14 @@ set(MITO_SOURCES ${MITO_SOURCES} ) endif() +# the mito cuda solvers backend +if (WITH_CUDA) +set(MITO_SOURCES ${MITO_SOURCES} + lib/mito/solvers/backend/cuda/CUDASolver.cu + lib/mito/solvers/backend/cuda/CUDADenseSolver.cu + lib/mito/solvers/backend/cuda/CUDASparseSolver.cu +) +endif() + # end of file diff --git a/.cmake/mito_tests_mito_lib.cmake b/.cmake/mito_tests_mito_lib.cmake index b3a8b9a84..0e63104eb 100644 --- a/.cmake/mito_tests_mito_lib.cmake +++ b/.cmake/mito_tests_mito_lib.cmake @@ -69,6 +69,11 @@ if(WITH_PETSC) mito_test_driver(tests/mito.lib/solvers/petsc_solve_linear_system.cc) endif() +if(WITH_CUDA) + mito_test_driver(tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc) + mito_test_driver(tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc) +endif() + # tensor mito_test_driver(tests/mito.lib/tensor/one_forms.cc) mito_test_driver(tests/mito.lib/tensor/contractions.cc) diff --git a/CMakeLists.txt b/CMakeLists.txt index 458b981ca..665525e3f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,6 +58,9 @@ include(mito_metis) # petsc support include(mito_petsc) +# cuda support +include(mito_cuda) + # configure mito as a cmake package (so that mito can be found by cmake with find_package(mito)) mito_shareCmakePackage() diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu new file mode 100644 index 000000000..2c0e79401 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu @@ -0,0 +1,243 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include "public.h" + + +// constructor +template +mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver(SolverType solver_type) : + CUDASolver(solver_type), + _h_matrix(), + _d_matrix(), + _cusolver_handle() +{ + // initialize cuSOLVER + _initialize_cusolver(); +} + +// destructor +template +mito::solvers::cuda::CUDADenseSolver::~CUDADenseSolver() +{ + // finalize cuSOLVER + _finalize_cusolver(); +} + +// add/insert {value} to matrix entry at ({row}, {col}) of the host copy +template +auto +mito::solvers::cuda::CUDADenseSolver::set_matrix_value( + size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void +{ + // check if the system assembly is finalized and throw an error if it is + if (this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is already finalized. Cannot add/insert values to the matrix."); + } + + // check if the row and column indices are within bounds + this->_check_index_validity(row); + this->_check_index_validity(col); + + // add/insert the value to the matrix entry in the host matrix + // NOTE: We store the matrix in column-major order since the cuSOLVER library expects the matrix + // to be in column-major order. + if (insert_mode == InsertMode::ADD_VALUE) + _h_matrix[col * this->_size + row] += value; + else if (insert_mode == InsertMode::INSERT_VALUE) + _h_matrix[col * this->_size + row] = value; + else + throw std::invalid_argument("Invalid insert mode. 
Use ADD_VALUE or INSERT_VALUE."); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::reset_system() -> void +{ + // check if the solver is initialized + if (!this->_is_solver_initialized) { + throw std::logic_error("Solver is not yet initialized. Call initialize() first."); + } + + // fill the host matrix, rhs and solution with zeros + _h_matrix.zero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // reset the assembly finalized flag + this->_is_assembly_finalized = false; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::solve() -> void +{ + // check if the assembly is finalized + if (!this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is not yet finalized. Call finalize_assembly() first."); + } + + // copy the host matrix and rhs data to device global memory + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.data(), _h_matrix.data(), this->_size * this->_size * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // allocate device memory for temporary variables in the factorization + int * d_pivot = nullptr; + int * d_info = nullptr; + real_type * d_workspace = nullptr; + int workspace_size = 0; + + // check the solver type is either LU or Cholesky + if (this->_solver_type != SolverType::LU && this->_solver_type != SolverType::CHOLESKY) { + throw std::invalid_argument( + "Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense " + "solver."); + } + + // get the workspace size for the factorization + if (this->_solver_type == SolverType::LU) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::getrf_buffer_size( + _cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size, + &workspace_size)); + } else if (this->_solver_type == SolverType::CHOLESKY) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::potrf_buffer_size( + _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(), + this->_size, &workspace_size)); + } + + CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, this->_size * sizeof(int))); + CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int))); + CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(real_type))); + + // perform the factorization + if (this->_solver_type == SolverType::LU) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::getrf( + _cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size, + d_workspace, d_pivot, d_info)); + } else if (this->_solver_type == SolverType::CHOLESKY) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::potrf( + _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(), + this->_size, d_workspace, workspace_size, d_info)); + } + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // solve the linear system + if (this->_solver_type == SolverType::LU) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::getrs( + _cusolver_handle, CUBLAS_OP_N, this->_size, 1, _d_matrix.data(), this->_size, + d_pivot, this->_d_rhs.data(), this->_size, d_info)); + } else if (this->_solver_type == SolverType::CHOLESKY) { + CHECK_CUSOLVER_ERROR( + cusolver_traits::potrs( + _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, 1, _d_matrix.data(), + this->_size, this->_d_rhs.data(), this->_size, d_info)); + } + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // copy the solution from device global memory to host memory + // NOTE: _d_rhs contains 
the solution after the call to getrs/potrs as its contents are + // overwritten by the solution vector + CHECK_CUDA_ERROR(cudaMemcpy( + this->_h_solution.data(), this->_d_rhs.data(), this->_size * sizeof(real_type), + cudaMemcpyDeviceToHost)); + + // free the temporary device memory + CHECK_CUDA_ERROR(cudaFree(d_pivot)); + CHECK_CUDA_ERROR(cudaFree(d_info)); + CHECK_CUDA_ERROR(cudaFree(d_workspace)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_initialize_cusolver() -> void +{ + // create the cuSOLVER handle + CHECK_CUSOLVER_ERROR(cusolverDnCreate(&_cusolver_handle)); + + // create a cuda stream + CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0)); + + // set the stream for the cuSOLVER handle + CHECK_CUSOLVER_ERROR(cusolverDnSetStream(_cusolver_handle, this->_cuda_stream)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_finalize_cusolver() -> void +{ + // destroy the cuSOLVER handle + CHECK_CUSOLVER_ERROR(cusolverDnDestroy(_cusolver_handle)); + + // destroy the cuda stream + CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream)); + + // reset the handle and stream pointers + _cusolver_handle = nullptr; + this->_cuda_stream = nullptr; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void +{ + // allocate host memory for matrix, rhs and solution + _h_matrix.allocate(size * size); + this->_h_rhs.allocate(size); + this->_h_solution.allocate(size); + + // zero out the arrays + _h_matrix.zero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void +{ + // allocate global device memory for matrix and rhs + _d_matrix.allocate(size * size); + this->_d_rhs.allocate(size); + + // all done + return; +} + +// explicit instantiation of the {CUDADenseSolver} class for doubles +template class mito::solvers::cuda::CUDADenseSolver; + +// explicit instantiation of the {CUDADenseSolver} class for floats +template class mito::solvers::cuda::CUDADenseSolver; diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h new file mode 100644 index 000000000..0c26e113f --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h @@ -0,0 +1,90 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +// a struct to hold the cusolver function pointers for different data types +template +struct cusolver_traits; + +template <> +struct cusolver_traits { + // Cholesky factorization and solve routines for double precision + static constexpr auto potrf_buffer_size = cusolverDnDpotrf_bufferSize; + static constexpr auto potrf = cusolverDnDpotrf; + static constexpr auto potrs = cusolverDnDpotrs; + // LU factorization and solve routines for double precision + static constexpr auto getrf_buffer_size = cusolverDnDgetrf_bufferSize; + static constexpr auto getrf = cusolverDnDgetrf; + static constexpr auto getrs = cusolverDnDgetrs; +}; + +template <> +struct cusolver_traits { + // Cholesky factorization and solve routines for single precision + static constexpr auto potrf_buffer_size = cusolverDnSpotrf_bufferSize; + static constexpr auto potrf = cusolverDnSpotrf; + static constexpr auto potrs = cusolverDnSpotrs; + // LU factorization and solve routines for single precision + 
static constexpr auto getrf_buffer_size = cusolverDnSgetrf_bufferSize; + static constexpr auto getrf = cusolverDnSgetrf; + static constexpr auto getrs = cusolverDnSgetrs; +}; + + +namespace mito::solvers::cuda { + + template + class CUDADenseSolver : public CUDASolver { + private: + // real type definition from CUDASolver + using real_type = typename CUDASolver::real_type; + + public: + // constructor + CUDADenseSolver(SolverType solver_type = SolverType::LU); + + // destructor + ~CUDADenseSolver(); + + public: + // set (add or insert depending on the mode) the value of a matrix entry in the host copy + auto set_matrix_value( + size_t row, size_t col, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override; + + // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution) + auto reset_system() -> void override; + + // solve the linear system + auto solve() -> void override; + + private: + // initialize the cuSOLVER utilities + auto _initialize_cusolver() -> void override; + + // destroy the cuSOLVER utilities + auto _finalize_cusolver() -> void override; + + // allocate the host memory for the matrix, right-hand side, and solution + auto _allocate_host_memory(size_t size) -> void override; + + // allocate the device memory for the matrix and right-hand side + auto _allocate_device_memory(size_t size) -> void override; + + private: + // host copy of the matrix + HostArray _h_matrix; + // device copy of the matrix + DeviceArray _d_matrix; + // cuSOLVER handle + cusolverDnHandle_t _cusolver_handle; + }; +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h new file mode 100644 index 000000000..0fb22c848 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h @@ -0,0 +1,70 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +// code guard +#pragma once + + +// CUDA error checking macro +#define CHECK_CUDA_ERROR(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error at %s:%d\n", __FILE__, __LINE__); \ + fprintf(stderr, " Error: %s (%d)\n", cudaGetErrorString(err), err); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// function to convert cuSOLVER error codes to strings +inline const char * +cusolverGetErrorString(cusolverStatus_t status) +{ + switch (status) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + default: + return "Unknown cuSOLVER error"; + } +} + +// cuSOLVER error checking macro +#define CHECK_CUSOLVER_ERROR(call) \ + do { \ + cusolverStatus_t status = (call); \ + if (status != CUSOLVER_STATUS_SUCCESS) { \ + fprintf( \ + stderr, "cuSOLVER error: %s at %s:%d\n", cusolverGetErrorString(status), __FILE__, \ + __LINE__); \ + 
exit(EXIT_FAILURE); \ + } \ + } while (0) + +// cuDSS error checking macro +#define CHECK_CUDSS_ERROR(call) \ + do { \ + cudssStatus_t status = (call); \ + if (status != CUDSS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuDSS error: %d at %s:%d\n", status, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.cu b/lib/mito/solvers/backend/cuda/CUDASolver.cu new file mode 100644 index 000000000..8ee9e4641 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASolver.cu @@ -0,0 +1,134 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include "public.h" + + +// constructor +template +mito::solvers::cuda::CUDASolver::CUDASolver(SolverType solver_type) : + _h_rhs(), + _h_solution(), + _d_rhs(), + _solver_type(solver_type), + _size(0), + _is_solver_initialized(false), + _is_assembly_finalized(false), + _cuda_stream(nullptr) +{} + +// destructor +template +mito::solvers::cuda::CUDASolver::~CUDASolver() +{} + +template +auto +mito::solvers::cuda::CUDASolver::initialize(size_t size) -> void +{ + // check if the solver is already initialized + if (_is_solver_initialized) { + throw std::logic_error( + "Solver is already initialized. You are not allowed to re-initialize it. If you want " + "to re-initialize, you need to create a new solver instance."); + } + + // check if the size is valid + if (size <= 0) { + throw std::invalid_argument("Size of the linear system must be greater than zero."); + } + + // save the size of the linear system + _size = size; + + // allocate host memory + _allocate_host_memory(size); + + // allocate device memory + _allocate_device_memory(size); + + // turn on the solver initialized flag + _is_solver_initialized = true; + + // all done + return; +} + +// add/insert {value} to rhs entry at {row} of the host copy +template +auto +mito::solvers::cuda::CUDASolver::set_rhs_value( + size_t row, const real_type value, const InsertMode insert_mode) -> void +{ + // check if the system assembly is finalized and throw an error if it is + if (_is_assembly_finalized) { + throw std::logic_error( + "System assembly is already finalized. Cannot add/insert values to the rhs."); + } + + // check if the row index is within bounds + this->_check_index_validity(row); + + // add/insert the value to the rhs entry in the host rhs + if (insert_mode == InsertMode::ADD_VALUE) + _h_rhs[row] += value; + else if (insert_mode == InsertMode::INSERT_VALUE) + _h_rhs[row] = value; + else + throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE."); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASolver::finalize_assembly() -> void +{ + // check if the solver is initialized + if (!_is_solver_initialized) { + throw std::logic_error( + "Solver is not yet initialized. Call initialize() first, assemble the " + "system, and then finalize the assembly."); + } + + // issue a warning that all entries should be set before finalizing the assembly + std::cerr + << "Warning: Finalizing assembly. Make sure all system entries are set before this step." + << std::endl; + + // set the assembly finalized flag to true + _is_assembly_finalized = true; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASolver::_check_index_validity(size_t index) const -> void +{ + // check if the solver is initialized + if (!_is_solver_initialized) { + throw std::logic_error("Solver is not yet initialized. 
Call initialize() first."); + } + + // check if the index is valid and return false if it is not + if (index >= _size) { + throw std::out_of_range( + "Index " + std::to_string(index) + " is out of range. It must be between 0 and " + + std::to_string(_size - 1) + "."); + } + + // all done + return; +} + +// explicit instantiation of the {CUDASolver} class for doubles +template class mito::solvers::cuda::CUDASolver; + +// explicit instantiation of the {CUDASolver} class for floats +template class mito::solvers::cuda::CUDASolver; diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.h b/lib/mito/solvers/backend/cuda/CUDASolver.h new file mode 100644 index 000000000..f8c5e3e9b --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASolver.h @@ -0,0 +1,96 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + template + class CUDASolver { + public: + // type alias for real numbers + using real_type = realT; + + public: + // constructor + CUDASolver(SolverType solver_type); + + // destructor + ~CUDASolver(); + + public: + // initialize the CUDA solver + auto initialize(size_t size) -> void; + + // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution) + virtual auto reset_system() -> void = 0; + + // set (add or insert depending on the mode) the value of a matrix entry in the host copy + virtual auto set_matrix_value( + size_t row, size_t col, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void = 0; + + // set (add or insert depending on the mode) the value of a right-hand side entry in the + // host copy + virtual auto set_rhs_value( + size_t row, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void; + + // finalize the linear system assembly + auto finalize_assembly() -> void; + + // solve the linear system + virtual auto solve() -> void = 0; + + // get the solution vector + template + auto get_solution(solutionT & solution) const -> void; + + protected: + // initialize the cuSOLVER utilities + virtual auto _initialize_cusolver() -> void = 0; + + // destroy the cuSOLVER utilities + virtual auto _finalize_cusolver() -> void = 0; + + // allocate the host memory for the matrix, right-hand side, and solution + virtual auto _allocate_host_memory(size_t size) -> void = 0; + + // allocate the device memory for the matrix and right-hand side + virtual auto _allocate_device_memory(size_t size) -> void = 0; + + // check the validity of the index in the matrix and right-hand side + auto _check_index_validity(size_t index) const -> void; + + protected: + // host copy of the right-hand side + HostArray _h_rhs; + // host copy of the solution + HostArray _h_solution; + // device copy of the right-hand side + DeviceArray _d_rhs; + // solver type + SolverType _solver_type; + // size of the linear system + size_t _size; + // flag to indicate if the solver has been initialized + bool _is_solver_initialized; + // flag to indicate if the system assembly has been finalized + bool _is_assembly_finalized; + // cuda stream + cudaStream_t _cuda_stream; + }; +} + + +// get the template definitions +#define mito_solvers_backend_cuda_CUDASolver_icc +#include "CUDASolver.icc" +#undef mito_solvers_backend_cuda_CUDASolver_icc + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.icc b/lib/mito/solvers/backend/cuda/CUDASolver.icc new file mode 100644 index 000000000..5ee5e26f3 --- /dev/null 
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.icc @@ -0,0 +1,32 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#if !defined(mito_solvers_backend_cuda_CUDASolver_icc) +#error This header file contains implementation details of class mito::solvers::cuda::CUDASolver +#else + +template +template +auto +mito::solvers::cuda::CUDASolver::get_solution(solutionT & solution) const -> void +{ + // check if {_size} matches the size of {solution} + assert(_size == std::size(solution)); + + // reset the entries in {solution} + std::fill(std::begin(solution), std::end(solution), 0.0); + + // copy {_h_solution} into {solution} + std::copy(_h_solution.data(), _h_solution.data() + _size, std::begin(solution)); + + // all done + return; +} + +#endif // mito_solvers_backend_cuda_CUDASolver_icc + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu new file mode 100644 index 000000000..8186672e1 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu @@ -0,0 +1,298 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include "public.h" + + +// constructor +template +mito::solvers::cuda::CUDASparseSolver::CUDASparseSolver(SolverType solver_type) : + CUDASolver(solver_type), + _h_matrix(), + _d_matrix(), + _d_solution(), + _cudss_handle() +{ + // initialize cuSOLVER + _initialize_cusolver(); +} + +// destructor +template +mito::solvers::cuda::CUDASparseSolver::~CUDASparseSolver() +{ + // finalize cuSOLVER + _finalize_cusolver(); +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::initialize(size_t size, size_t nnz) -> void +{ + // call the base class initialize function + CUDASolver::initialize(size); + + // reserve space for non-zero entries per row in the host sparse matrix for efficiency + // NOTE: This is just an estimate. The actual number of non-zero entries may be less than + // nnz. But it is better to set it to a reasonable value to avoid multiple reallocations. + // In general, over-allocating is better than under-allocating. + (nnz == 0) ? _h_matrix.reserve(Eigen::VectorXi::Constant(size, size)) : + _h_matrix.reserve(Eigen::VectorXi::Constant(size, nnz)); + + // all done + return; +} + +// add/insert {value} to matrix entry at ({row}, {col}) of the host copy +template +auto +mito::solvers::cuda::CUDASparseSolver::set_matrix_value( + size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void +{ + // check if the system assembly is finalized and throw an error if it is + if (this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is already finalized. Cannot add/insert values to the matrix."); + } + + // check if the row and column indices are within bounds + this->_check_index_validity(row); + this->_check_index_validity(col); + + // add/insert the value to the matrix entry in the host matrix + // NOTE: We store the host matrix in CSR format using Eigen's SparseMatrix class in row-major + // order. + if (insert_mode == InsertMode::ADD_VALUE) + _h_matrix.coeffRef(row, col) += value; + else if (insert_mode == InsertMode::INSERT_VALUE) + _h_matrix.coeffRef(row, col) = value; + else + throw std::invalid_argument("Invalid insert mode. 
Use ADD_VALUE or INSERT_VALUE."); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::reset_system() -> void +{ + // check if the solver is initialized + if (!this->_is_solver_initialized) { + throw std::logic_error("Solver is not yet initialized. Call initialize() first."); + } + + // fill the host matrix, rhs and solution with zeros + _h_matrix.setZero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // reset the assembly finalized flag + this->_is_assembly_finalized = false; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::solve() -> void +{ + // check if the assembly is finalized + if (!this->_is_assembly_finalized) { + throw std::logic_error( + "System assembly is not yet finalized. Call finalize_assembly() first."); + } + + // finalize the host matrix (compress the storage) + // NOTE: This is not a mandatory step as we are only using the Eigen matrix for storage on the + // host but we are doing it anyway. + _h_matrix.makeCompressed(); + + // create and fill the auxiliary containers to hold the CSR representation of the host matrix + size_t nnz_matrix = _h_matrix.nonZeros(); + std::vector h_csr_values(nnz_matrix, 0.0); + std::vector h_csr_row_offsets(this->_size + 1, 0); + std::vector h_csr_col_indices(nnz_matrix, 0); + int idx = 0; + for (size_t k = 0; k < this->_size; ++k) { // for each row + h_csr_row_offsets[k] = idx; + for (typename Eigen::SparseMatrix::InnerIterator it( + _h_matrix, k); + it; ++it) { // for each non-zero entry in the row + h_csr_values[idx] = it.value(); + h_csr_col_indices[idx] = it.col(); + ++idx; + } + } + h_csr_row_offsets[this->_size] = nnz_matrix; + + // allocate device memory for the matrix + _d_matrix.allocate(this->_size, this->_size, nnz_matrix); + + // copy the host matrix and rhs data to device global memory + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.values(), h_csr_values.data(), h_csr_values.size() * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.row_offsets(), h_csr_row_offsets.data(), h_csr_row_offsets.size() * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + _d_matrix.col_indices(), h_csr_col_indices.data(), h_csr_col_indices.size() * sizeof(int), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaMemcpy( + this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type), + cudaMemcpyHostToDevice)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // cuDSS data structures to solve the linear system + cudssConfig_t config; + cudssData_t data; + cudssMatrix_t d_A, d_b, d_x; + + // create the cuDSS config and data structures + CHECK_CUDSS_ERROR(cudssConfigCreate(&config)); + CHECK_CUDSS_ERROR(cudssDataCreate(_cudss_handle, &data)); + + // determine the data type + cudaDataType_t dtype; + if constexpr (std::is_same_v) { + dtype = CUDA_R_64F; + } else if constexpr (std::is_same_v) { + dtype = CUDA_R_32F; + } + + // determine the matrix type based on the solver type + cudssMatrixType_t mtype; + if (this->_solver_type == SolverType::LU) { + mtype = CUDSS_MTYPE_GENERAL; + } else if (this->_solver_type == SolverType::CHOLESKY) { + mtype = CUDSS_MTYPE_SPD; + } else { + throw std::logic_error( + "Unsupported solver type. 
Only LU and CHOLESKY are supported in the CUDA sparse " + "solver."); + } + + // create (CSR) matrix and vector descriptors + CHECK_CUDSS_ERROR(cudssMatrixCreateCsr( + &d_A, this->_size, this->_size, nnz_matrix, _d_matrix.row_offsets(), NULL, + _d_matrix.col_indices(), _d_matrix.values(), CUDA_R_32I, dtype, mtype, CUDSS_MVIEW_FULL, + CUDSS_BASE_ZERO)); + CHECK_CUDSS_ERROR(cudssMatrixCreateDn( + &d_b, this->_size, 1, this->_size, this->_d_rhs.data(), dtype, CUDSS_LAYOUT_COL_MAJOR)); + CHECK_CUDSS_ERROR(cudssMatrixCreateDn( + &d_x, this->_size, 1, this->_size, _d_solution.data(), dtype, CUDSS_LAYOUT_COL_MAJOR)); + + /************* solve the linear system in three stages *************/ + + // Stage 1: reordering & symbolic factorization + CHECK_CUDSS_ERROR( + cudssExecute(_cudss_handle, CUDSS_PHASE_ANALYSIS, config, data, d_A, d_x, d_b)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // Stage 2: numerical factorization + CHECK_CUDSS_ERROR( + cudssExecute(_cudss_handle, CUDSS_PHASE_FACTORIZATION, config, data, d_A, d_x, d_b)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // Stage 3: solve the system + CHECK_CUDSS_ERROR(cudssExecute(_cudss_handle, CUDSS_PHASE_SOLVE, config, data, d_A, d_x, d_b)); + CHECK_CUDA_ERROR(cudaDeviceSynchronize()); + + // copy the solution back to host + CHECK_CUDA_ERROR(cudaMemcpy( + this->_h_solution.data(), _d_solution.data(), this->_size * sizeof(real_type), + cudaMemcpyDeviceToHost)); + + // destroy the cuDSS data structures + CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_A)); + CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_b)); + CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_x)); + CHECK_CUDSS_ERROR(cudssDataDestroy(_cudss_handle, data)); + CHECK_CUDSS_ERROR(cudssConfigDestroy(config)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_initialize_cusolver() -> void +{ + // create the cuDSS handle + CHECK_CUDSS_ERROR(cudssCreate(&_cudss_handle)); + + // create a cuda stream + CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0)); + + // set the stream for the cuDSS handle + CHECK_CUDSS_ERROR(cudssSetStream(_cudss_handle, this->_cuda_stream)); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_finalize_cusolver() -> void +{ + // destroy the cuDSS handle + CHECK_CUDSS_ERROR(cudssDestroy(_cudss_handle)); + + // destroy the cuda stream + CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream)); + + // reset the handle and stream pointers + _cudss_handle = nullptr; + this->_cuda_stream = nullptr; + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_allocate_host_memory(size_t size) -> void +{ + // allocate host memory for matrix, rhs and solution + _h_matrix.resize(size, size); + this->_h_rhs.allocate(size); + this->_h_solution.allocate(size); + + // zero out the arrays + _h_matrix.setZero(); + this->_h_rhs.zero(); + this->_h_solution.zero(); + + // all done + return; +} + +template +auto +mito::solvers::cuda::CUDASparseSolver::_allocate_device_memory(size_t size) -> void +{ + // NOTE: We are not allocating memory for the matrix here because we do not know the number of + // non-zero entries yet and allocating device memory is an expensive operation. So we will + // allocate the device memory for the matrix directly after the assembly is finalized. 
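+    // (accordingly, solve() above sizes and fills the device CSR buffers via _d_matrix.allocate(_size, _size, nnz) once the non-zero count is known)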
+ + // allocate device memory for rhs and solution + this->_d_rhs.allocate(size); + _d_solution.allocate(size); + + // zero out the solution array + _d_solution.zero(); + + // all done + return; +} + +// explicit instantiation of the {CUDASparseSolver} class for doubles +template class mito::solvers::cuda::CUDASparseSolver; + +// explicit instantiation of the {CUDASparseSolver} class for floats +template class mito::solvers::cuda::CUDASparseSolver; diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.h b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h new file mode 100644 index 000000000..c8967aee1 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h @@ -0,0 +1,65 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + template + class CUDASparseSolver : public CUDASolver { + private: + // real type definition from CUDASolver + using real_type = typename CUDASolver::real_type; + + public: + // constructor + CUDASparseSolver(SolverType solver_type = SolverType::LU); + + // destructor + ~CUDASparseSolver(); + + public: + // initialize the CUDA sparse solver + auto initialize(size_t size, size_t nnz = 0) -> void; + + // set (add or insert depending on the mode) the value of a matrix entry in the host copy + auto set_matrix_value( + size_t row, size_t col, const real_type value, + const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override; + + // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution) + auto reset_system() -> void override; + + // solve the linear system + auto solve() -> void override; + + private: + // initialize the cuSOLVER utilities + auto _initialize_cusolver() -> void override; + + // destroy the cuSOLVER utilities + auto _finalize_cusolver() -> void override; + + // allocate the host memory for the matrix, right-hand side, and solution + auto _allocate_host_memory(size_t size) -> void override; + + // allocate the device memory for the matrix and right-hand side + auto _allocate_device_memory(size_t size) -> void override; + + private: + // host copy of the matrix in CSR format + Eigen::SparseMatrix _h_matrix; + // device copy of the matrix in CSR format + DeviceSparseMatrix _d_matrix; + // device copy of the solution + DeviceArray _d_solution; + // cuDSS handle + cudssHandle_t _cudss_handle; + }; +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/api.h b/lib/mito/solvers/backend/cuda/api.h new file mode 100644 index 000000000..fbb598816 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/api.h @@ -0,0 +1,22 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // cuda dense solver + template + using dense_t = CUDADenseSolver; + + // cuda sparse solver + template + using sparse_t = CUDASparseSolver; +} + + +// end of file \ No newline at end of file diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h new file mode 100644 index 000000000..a6422a3bd --- /dev/null +++ b/lib/mito/solvers/backend/cuda/externals.h @@ -0,0 +1,42 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +// externals +#include +#include +#include +#include +#include +#ifdef WITH_EIGEN +#include // for sparse matrix support +#endif // WITH_EIGEN + + +// cuda support +#include 
+#include +#ifdef WITH_EIGEN +#include +#endif // WITH_EIGEN + + +// utilities +#include "utilities.h" + + +namespace mito::solvers::cuda { + + // concept for a real type + template + concept real_c = std::is_floating_point_v; + +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h new file mode 100644 index 000000000..8dd3f44a6 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/factories.h @@ -0,0 +1,28 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // cuda dense solver + template + auto dense(SolverType solver_type = SolverType::LU) + { + return dense_t(solver_type); + } + + // cuda sparse solver + template + auto sparse(SolverType solver_type = SolverType::LU) + { + return sparse_t(solver_type); + } +} + + +// end of file \ No newline at end of file diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h new file mode 100644 index 000000000..ec32486d8 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/forward.h @@ -0,0 +1,28 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // available insert modes for matrix and right-hand side entries + enum InsertMode { ADD_VALUE, INSERT_VALUE }; + + // available solver types + enum SolverType { CHOLESKY, LU }; + + // class for CUDA dense solver + template + class CUDADenseSolver; + + // class for CUDA sparse solver + template + class CUDASparseSolver; +} + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h new file mode 100644 index 000000000..cd87df6b1 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/public.h @@ -0,0 +1,33 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +// external packages +#include "externals.h" + +// get the forward declarations +#include "forward.h" + +// published types +#include "api.h" + +// classes +#include "CUDASolver.h" +#include "CUDADenseSolver.h" +#ifdef WITH_EIGEN +#include "CUDASparseSolver.h" +#endif // WITH_EIGEN + +// error checks +#include "CUDAErrorChecks.h" + +// factories implementation +#include "factories.h" + + +// end of file diff --git a/lib/mito/solvers/backend/cuda/utilities.h b/lib/mito/solvers/backend/cuda/utilities.h new file mode 100644 index 000000000..c84a8af23 --- /dev/null +++ b/lib/mito/solvers/backend/cuda/utilities.h @@ -0,0 +1,508 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + +// code guard +#pragma once + + +namespace mito::solvers::cuda { + + // a utility class to manage host memory + template + class HostArray { + public: + // type alias for real numbers + using real_type = T; + // available host memory types + enum HostMemoryType { PAGEABLE, PINNED }; + + public: + // default constructor + HostArray() : _data(nullptr), _size(0), _memory_type(HostMemoryType::PINNED) {} + + // parameterized constructor + HostArray(std::size_t size, HostMemoryType memory_type = HostMemoryType::PINNED) : + _data(nullptr), + _size(size), + _memory_type(memory_type) + { + allocate(size); + } + + // allocate on construction + explicit HostArray(std::size_t size) : HostArray() { allocate(size); } + + // destructor + ~HostArray() { _free_memory(); } + + // delete copy constructor + 
HostArray(const HostArray &) = delete; + + // delete copy assignment operator + HostArray & operator=(const HostArray &) = delete; + + // enable move semantics + HostArray(HostArray && other) noexcept : + _data(other._data), + _size(other._size), + _memory_type(other._memory_type) + { + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + other._memory_type = HostMemoryType::PINNED; + } + + HostArray & operator=(HostArray && other) noexcept + { + if (this != &other) { + // free existing memory and acquire new memory + _free_memory(); + _data = other._data; + _size = other._size; + _memory_type = other._memory_type; + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + other._memory_type = HostMemoryType::PINNED; + } + return *this; + } + + // allocate or reallocate memory for the array + void allocate(std::size_t size) + { + if (_size == size) + return; + + // free old memory if any + // NOTE: This could be expensive if called frequently. We are going ahead with this + // for now for simplicity. But, as a first step towards improvement, we can come up with + // a way to reuse existing memory if the new allocation size is smaller than the current + // size. + _free_memory(); + + _size = size; + if (_size == 0) + return; + + // try to allocate pinned memory first for faster transfers + cudaError_t pinned_alloc_error = + cudaMallocHost((void **) &_data, _size * sizeof(real_type)); + if (pinned_alloc_error == cudaSuccess) { + _memory_type = HostMemoryType::PINNED; + } else { + // fallback to pageable memory + try { + std::cerr + << "[HostArray] Warning: Failed to allocate pinned memory. Falling back " + "to pageable memory\n"; + _data = new real_type[_size]; + _memory_type = HostMemoryType::PAGEABLE; + } catch (const std::bad_alloc & e) { + throw std::runtime_error( + "Failed to allocate host memory: " + std::string(e.what())); + } + } + + // all done + return; + } + + // zero out the array + void zero() + { + if (_data) { + std::fill(_data, _data + _size, static_cast(0.0)); + } + } + + // data accessor + real_type * data() { return _data; } + + // const data accessor + const real_type * data() const { return _data; } + + // size accessor + std::size_t size() const { return _size; } + + // accessor for memory type + HostMemoryType memory_type() const { return _memory_type; } + + // element access + real_type & operator[](std::size_t idx) { return _data[idx]; } + + // const element access + const real_type & operator[](std::size_t idx) const { return _data[idx]; } + + private: + // free the memory in the array + void _free_memory() + { + if (!_data) + return; + + // free the memory + if (_memory_type == HostMemoryType::PINNED) { + cudaError_t error = cudaFreeHost(_data); + if (error != cudaSuccess) { + std::cerr << "[HostArray] Error: Failed to free pinned memory: " + << cudaGetErrorString(error) << "\n"; + } + } else { + delete[] _data; + } + + // reset the state + _data = nullptr; + _size = 0; + _memory_type = HostMemoryType::PINNED; + + // all done + return; + } + + private: + // pointer to the data + real_type * _data; + // size of the array + std::size_t _size; + // type of memory + HostMemoryType _memory_type; + }; + + // a utility class to manage device memory + template + class DeviceArray { + public: + using real_type = T; + + public: + // default constructor + DeviceArray() : _data(nullptr), _size(0) {} + + // allocate on construction + explicit DeviceArray(std::size_t size) : DeviceArray() { allocate(size); } 
+ + // destructor + ~DeviceArray() { _free_memory(); } + + // delete copy constructor + DeviceArray(const DeviceArray &) = delete; + + // delete copy assignment operator + DeviceArray & operator=(const DeviceArray &) = delete; + + // enable move semantics + DeviceArray(DeviceArray && other) noexcept : _data(other._data), _size(other._size) + { + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + } + + DeviceArray & operator=(DeviceArray && other) noexcept + { + if (this != &other) { + // free existing memory and acquire new memory + _free_memory(); + _data = other._data; + _size = other._size; + // reset the state of the moved-from object + other._data = nullptr; + other._size = 0; + } + return *this; + } + + // allocate or reallocate memory for the array + void allocate(std::size_t size) + { + if (_size == size) + return; + + // free old memory if any + _free_memory(); + + _size = size; + if (_size == 0) + return; + + // allocate device memory + cudaError_t error = cudaMalloc((void **) &_data, size * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceArray] Error: Failed to allocate device memory: " + + std::string(cudaGetErrorString(error))); + } + + // all done + return; + } + + // zero out the device array + void zero() + { + if (_data) { + cudaError_t error = cudaMemset(_data, 0, _size * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceArray] Error: Failed to zero out device memory: " + + std::string(cudaGetErrorString(error))); + } + } + } + + // data accessor + real_type * data() { return _data; } + + // const data accessor + const real_type * data() const { return _data; } + + // size accessor + std::size_t size() const { return _size; } + + private: + // free the memory in the array + void _free_memory() + { + if (!_data) + return; + + // free the memory + cudaError_t error = cudaFree(_data); + if (error != cudaSuccess) { + std::cerr << "[DeviceArray] Error: Failed to free device memory: " + << cudaGetErrorString(error) << "\n"; + } + + // reset the state + _data = nullptr; + _size = 0; + + // all done + return; + } + + private: + // pointer to the data + real_type * _data; + // size of the array + std::size_t _size; + }; + + // a utility class to manage device memory for sparse matrices in CSR format + template + class DeviceSparseMatrix { + public: + using real_type = T; + using index_type = int; + + public: + // default constructor + DeviceSparseMatrix() : + _values(nullptr), + _row_offsets(nullptr), + _col_indices(nullptr), + _rows(0), + _cols(0), + _nnz(0) + {} + + // allocate on construction + explicit DeviceSparseMatrix(std::size_t rows, std::size_t cols, std::size_t nnz) : + DeviceSparseMatrix() + { + allocate(rows, cols, nnz); + } + + // destructor + ~DeviceSparseMatrix() { _free_memory(); } + + // delete copy constructor + DeviceSparseMatrix(const DeviceSparseMatrix &) = delete; + + // delete copy assignment operator + DeviceSparseMatrix & operator=(const DeviceSparseMatrix &) = delete; + + // enable move semantics + DeviceSparseMatrix(DeviceSparseMatrix && other) noexcept : + _values(other._values), + _row_offsets(other._row_offsets), + _col_indices(other._col_indices), + _rows(other._rows), + _cols(other._cols), + _nnz(other._nnz) + { + // reset the state of the moved-from object + other._values = nullptr; + other._row_offsets = nullptr; + other._col_indices = nullptr; + other._rows = 0; + other._cols = 0; + other._nnz = 0; + } + + DeviceSparseMatrix & 
operator=(DeviceSparseMatrix && other) noexcept + { + if (this != &other) { + // free existing memory and acquire new memory + _free_memory(); + _values = other._values; + _row_offsets = other._row_offsets; + _col_indices = other._col_indices; + _rows = other._rows; + _cols = other._cols; + _nnz = other._nnz; + // reset the state of the moved-from object + other._values = nullptr; + other._row_offsets = nullptr; + other._col_indices = nullptr; + other._rows = 0; + other._cols = 0; + other._nnz = 0; + } + return *this; + } + + // allocate or reallocate memory for the matrix + void allocate(std::size_t rows, std::size_t cols, std::size_t nnz) + { + if (_rows == rows && _cols == cols && _nnz == nnz) + return; + + // free old memory if any + _free_memory(); + + _rows = rows; + _cols = cols; + _nnz = nnz; + if (_rows == 0 || _cols == 0 || _nnz == 0) // no memory to allocate + return; + + // allocate device memory for the CSR representation + cudaError_t error; + + error = cudaMalloc((void **) &_values, _nnz * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to allocate values array: " + + std::string(cudaGetErrorString(error))); + } + + error = cudaMalloc((void **) &_row_offsets, (_rows + 1) * sizeof(index_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to allocate row offsets array: " + + std::string(cudaGetErrorString(error))); + } + + error = cudaMalloc((void **) &_col_indices, _nnz * sizeof(index_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to allocate column indices array: " + + std::string(cudaGetErrorString(error))); + } + + // all done + return; + } + + // zero out the device matrix + void zero() + { + if (_values) { + cudaError_t error = cudaMemset(_values, 0, _nnz * sizeof(real_type)); + if (error != cudaSuccess) { + throw std::runtime_error( + "[DeviceSparseMatrix] Error: Failed to zero out device memory: " + + std::string(cudaGetErrorString(error))); + } + } + } + + // values accessor + real_type * values() { return _values; } + + // row offsets accessor + index_type * row_offsets() { return _row_offsets; } + + // column indices accessor + index_type * col_indices() { return _col_indices; } + + // const values accessor + const real_type * values() const { return _values; } + + // const row offsets accessor + const index_type * row_offsets() const { return _row_offsets; } + + // const column indices accessor + const index_type * col_indices() const { return _col_indices; } + + // row number accessor + size_t rows() const { return _rows; } + + // column number accessor + size_t cols() const { return _cols; } + + // non-zero entries accessor + size_t nnz() const { return _nnz; } + + private: + // free the memory in the matrix + void _free_memory() + { + if (_values) { + cudaError_t error = cudaFree(_values); + if (error != cudaSuccess) { + std::cerr << "[DeviceSparseMatrix] Error: Failed to free values array: " + << cudaGetErrorString(error) << "\n"; + } + } + if (_row_offsets) { + cudaError_t error = cudaFree(_row_offsets); + if (error != cudaSuccess) { + std::cerr << "[DeviceSparseMatrix] Error: Failed to free row offsets array: " + << cudaGetErrorString(error) << "\n"; + } + } + if (_col_indices) { + cudaError_t error = cudaFree(_col_indices); + if (error != cudaSuccess) { + std::cerr << "[DeviceSparseMatrix] Error: Failed to free column indices array: " + << cudaGetErrorString(error) << "\n"; + } + } + 
+ // reset the state + _values = nullptr; + _row_offsets = nullptr; + _col_indices = nullptr; + _rows = 0; + _cols = 0; + _nnz = 0; + + // all done + return; + } + + private: + // pointer to the non-zero values + real_type * _values; + // pointer to the row offsets + index_type * _row_offsets; + // pointer to the column indices + index_type * _col_indices; + // number of rows + size_t _rows; + // number of columns + size_t _cols; + // number of non-zero entries + size_t _nnz; + }; +} + + +// end of file diff --git a/lib/mito/solvers/public.h b/lib/mito/solvers/public.h index aca9e6a46..698d69016 100644 --- a/lib/mito/solvers/public.h +++ b/lib/mito/solvers/public.h @@ -12,4 +12,9 @@ #endif // WITH_PETSC +#ifdef WITH_CUDA +#include "backend/cuda/public.h" +#endif // WITH_CUDA + + // end of file diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc new file mode 100644 index 000000000..6cd7ea189 --- /dev/null +++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc @@ -0,0 +1,145 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include +#include + + +TEST(Solvers, CUDADenseSolverSymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Dense solver for a linear system of size {N} + auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::CHOLESKY); + solver.initialize(N); + + // set matrix entries + double A[10][10] = { { 3.5, -1.2, 0.8, 0.0, -0.5, 1.1, 0.0, 0.3, -0.9, 0.4 }, + { -1.2, 4.0, 1.3, -0.4, 0.6, -0.7, 0.0, 0.5, 0.0, -1.0 }, + { 0.8, 1.3, 3.8, 0.7, -0.8, 0.0, -0.2, 0.6, -0.5, 0.0 }, + { 0.0, -0.4, 0.7, 4.2, 1.0, 0.9, 0.3, 0.0, 0.4, -0.3 }, + { -0.5, 0.6, -0.8, 1.0, 4.5, 0.4, -0.7, 0.8, 0.0, 0.2 }, + { 1.1, -0.7, 0.0, 0.9, 0.4, 3.6, -0.3, 0.0, -0.4, 0.5 }, + { 0.0, 0.0, -0.2, 0.3, -0.7, -0.3, 3.3, 0.6, -0.8, 0.0 }, + { 0.3, 0.5, 0.6, 0.0, 0.8, 0.0, 0.6, 4.1, 0.2, 0.7 }, + { -0.9, 0.0, -0.5, 0.4, 0.0, -0.4, -0.8, 0.2, 3.7, 0.9 }, + { 0.4, -1.0, 0.0, -0.3, 0.2, 0.5, 0.0, 0.7, 0.9, 4.0 } }; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + solver.set_matrix_value(i, j, A[i][j], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + } + + // set rhs entries + double b[10] = { 1.0, -2.0, 0.5, 1.5, -1.0, 0.7, -0.8, 2.0, 0.0, -1.5 }; + for (int i = 0; i < N; i++) { + solver.set_rhs_value(i, b[i], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], -0.0560503759, tol); + EXPECT_NEAR(x[1], -0.6932480517, tol); + EXPECT_NEAR(x[2], 0.0666007599, tol); + EXPECT_NEAR(x[3], 0.3544482567, tol); + EXPECT_NEAR(x[4], -0.4084514057, tol); + EXPECT_NEAR(x[5], 0.0767339355, tol); + EXPECT_NEAR(x[6], -0.5097195863, tol); + EXPECT_NEAR(x[7], 0.8324810769, tol); + EXPECT_NEAR(x[8], -0.0333448254, tol); + EXPECT_NEAR(x[9], -0.6434741305, tol); + + // all done + return; +} + +TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Dense solver for a linear system of size {N} + auto solver = 
mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::LU); + solver.initialize(N); + + // set matrix entries + double A[10][10] = { { 3.5, -1.2, 0.8, 0.0, -0.5, 1.1, 0.0, 0.3, -0.9, 0.4 }, + { 0.6, 4.0, 1.3, -0.4, 0.6, -0.7, 0.0, 0.5, 0.0, -1.0 }, + { -1.1, 1.0, 3.8, 0.7, -0.8, 0.0, -0.2, 0.6, -0.5, 0.0 }, + { 0.0, -0.4, 0.7, 4.2, -0.9, 0.9, 0.3, 0.0, 0.4, -0.3 }, + { 0.9, 0.0, -0.8, 1.0, 4.5, 0.4, -0.7, 0.8, 0.0, 0.2 }, + { 1.1, -0.7, 0.0, -1.3, 0.4, 3.6, -0.3, 0.0, -0.4, 0.5 }, + { 0.0, 0.0, 1.1, 0.3, -0.7, -0.3, 3.3, 0.6, -0.8, 0.0 }, + { 0.3, 0.5, 0.6, 0.0, 0.8, 0.0, 0.6, 4.1, -0.5, 0.7 }, + { -0.9, 0.0, -0.5, -1.0, 0.0, -0.4, -0.8, 0.2, 3.7, 0.9 }, + { 0.4, 1.3, 0.0, -0.3, 0.2, 0.5, 0.0, 0.7, 0.9, 4.0 } }; + for (int i = 0; i < N; i++) { + for (int j = 0; j < N; j++) { + solver.set_matrix_value(i, j, A[i][j], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + } + + // set rhs entries + double b[10] = { -1.2, 0.5, 0.7, 1.0, 0.3, -1.5, 2.0, -0.8, 0.6, 1.1 }; + for (int i = 0; i < N; i++) { + solver.set_rhs_value(i, b[i], mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], -0.1642774470, tol); + EXPECT_NEAR(x[1], 0.1352982046, tol); + EXPECT_NEAR(x[2], 0.2493049978, tol); + EXPECT_NEAR(x[3], 0.2587528172, tol); + EXPECT_NEAR(x[4], 0.2711567201, tol); + EXPECT_NEAR(x[5], -0.2258981243, tol); + EXPECT_NEAR(x[6], 0.6823460181, tol); + EXPECT_NEAR(x[7], -0.4005242029, tol); + EXPECT_NEAR(x[8], 0.3015605175, tol); + EXPECT_NEAR(x[9], 0.2837823381, tol); + + // all done + return; +} + + +int +main(int argc, char ** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto result = RUN_ALL_TESTS(); + + // all done + return result; +} + + +// end of file diff --git a/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc new file mode 100644 index 000000000..e29a8a77e --- /dev/null +++ b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc @@ -0,0 +1,131 @@ +// -*- c++ -*- +// +// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved +// + + +#include +#include + +TEST(Solvers, CUDASparseSolverSymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Sparse solver for a linear system of size {N} + auto solver = + mito::solvers::cuda::sparse(mito::solvers::cuda::SolverType::CHOLESKY); + solver.initialize(N, 3); // 3 non-zero entries per row + + // set matrix and right-hand side entries + for (int i = 0; i < N; i++) { + // matrix: diagonal + solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE); + // matrix: subdiagonal + if (i > 0) { + solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + } + // matrix: superdiagonal + if (i < N - 1) { + solver.set_matrix_value(i, i + 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + // rhs + solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], 5.0, tol); 
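+    // NOTE: the expected entries follow the closed form x_k = k * (N + 1 - k) / 2 (k = 1, ..., N, 1-based) for this tridiagonal system with 2 on the diagonal, -1 on the off-diagonals, and a unit right-hand side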
+ EXPECT_NEAR(x[1], 9.0, tol); + EXPECT_NEAR(x[2], 12.0, tol); + EXPECT_NEAR(x[3], 14.0, tol); + EXPECT_NEAR(x[4], 15.0, tol); + EXPECT_NEAR(x[5], 15.0, tol); + EXPECT_NEAR(x[6], 14.0, tol); + EXPECT_NEAR(x[7], 12.0, tol); + EXPECT_NEAR(x[8], 9.0, tol); + EXPECT_NEAR(x[9], 5.0, tol); + + // all done + return; +} + +TEST(Solvers, CUDASparseSolverUnsymmetricLinearSystem) +{ + // the size of the linear system + int N = 10; + + // numerical tolerance for the solution + double tol = 1.0e-10; + + // instantiate a CUDA Sparse solver for a linear system of size {N} + auto solver = mito::solvers::cuda::sparse(mito::solvers::cuda::SolverType::LU); + solver.initialize(N); // non-zeros per row are not set + + // set matrix and right-hand side entries + for (int i = 0; i < N; i++) { + // matrix: diagonal + solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + // matrix: subdiagonal + if (i > 0) { + solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + // matrix: superdiagonal + if (i < N - 1) { + solver.set_matrix_value(i, i + 1, -0.5, mito::solvers::cuda::InsertMode::INSERT_VALUE); + } + // rhs + solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::ADD_VALUE); + } + + // finalize the assembly of the linear system + solver.finalize_assembly(); + + // solve the linear system + solver.solve(); + + // read the solution + auto x = std::vector(N); + solver.get_solution(x); + + // check the solution + EXPECT_NEAR(x[0], 0.8284194482079074, tol); + EXPECT_NEAR(x[1], 1.3136777928316294, tol); + EXPECT_NEAR(x[2], 1.5978722749107030, tol); + EXPECT_NEAR(x[3], 1.7641335139795535, tol); + EXPECT_NEAR(x[4], 1.8607895060968098, tol); + EXPECT_NEAR(x[5], 1.9148909964281315, tol); + EXPECT_NEAR(x[6], 1.9379849735189063, tol); + EXPECT_NEAR(x[7], 1.9221579012193621, tol); + EXPECT_NEAR(x[8], 1.8126616578396353, tol); + EXPECT_NEAR(x[9], 1.4063308289198175, tol); + + // all done + return; +} + + +int +main(int argc, char ** argv) +{ + ::testing::InitGoogleTest(&argc, argv); + auto result = RUN_ALL_TESTS(); + + // all done + return result; +} + + +// end of file
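Usage sketch (not part of the patch): the snippet below only summarizes the new solver API end to end, mirroring the dense-solver tests above; the mito/solvers.h include path and the explicit double template argument are assumptions inferred from the headers in this diff rather than verbatim project conventions.

// a minimal sketch: assemble and solve a 2x2 SPD system with the CUDA dense Cholesky solver
#include <mito/solvers.h>    // assumed public header exposing mito::solvers::cuda
#include <vector>

int main()
{
    constexpr int N = 2;

    // create a dense Cholesky solver and size the linear system
    auto solver = mito::solvers::cuda::dense<double>(mito::solvers::cuda::SolverType::CHOLESKY);
    solver.initialize(N);

    // assemble A = [[4, 1], [1, 3]] and b = [1, 2] on the host (INSERT_VALUE is the default mode)
    solver.set_matrix_value(0, 0, 4.0);
    solver.set_matrix_value(0, 1, 1.0);
    solver.set_matrix_value(1, 0, 1.0);
    solver.set_matrix_value(1, 1, 3.0);
    solver.set_rhs_value(0, 1.0);
    solver.set_rhs_value(1, 2.0);

    // lock the assembly, solve on the device, and copy the solution back to the host
    solver.finalize_assembly();
    solver.solve();

    std::vector<double> x(N);
    solver.get_solution(x);    // expected solution: x = { 1/11, 7/11 }

    return 0;
}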