From 39b609e8f9a82f410c8bfe87d5b72bb6c50d058d Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Fri, 6 Jun 2025 18:19:26 +0200
Subject: [PATCH 01/35] solvers/cuda/CUDADenseSolver: first attempt at the CUDA
 dense solver implementation

This is the first attempt at implementing a CUDA-based dense solver. The code doesn't compile yet since we didn't make the necessary modifications to the build system to use the CUDA compiler. We will make those changes in the next commit.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 468 ++++++++++++++++++
 .../solvers/backend/cuda/CUDADenseSolver.h    | 104 ++++
 .../solvers/backend/cuda/CUDADenseSolver.icc  |  16 +
 lib/mito/solvers/backend/cuda/api.h           |  18 +
 lib/mito/solvers/backend/cuda/externals.h     |  25 +
 lib/mito/solvers/backend/cuda/factories.h     |  21 +
 lib/mito/solvers/backend/cuda/forward.h       |  20 +
 lib/mito/solvers/backend/cuda/public.h        |  26 +
 lib/mito/solvers/public.h                     |   5 +
 9 files changed, 703 insertions(+)
 create mode 100644 lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
 create mode 100644 lib/mito/solvers/backend/cuda/CUDADenseSolver.h
 create mode 100644 lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
 create mode 100644 lib/mito/solvers/backend/cuda/api.h
 create mode 100644 lib/mito/solvers/backend/cuda/externals.h
 create mode 100644 lib/mito/solvers/backend/cuda/factories.h
 create mode 100644 lib/mito/solvers/backend/cuda/forward.h
 create mode 100644 lib/mito/solvers/backend/cuda/public.h

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
new file mode 100644
index 000000000..1af745895
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -0,0 +1,468 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+#include "public.h"
+
+
+// CUDA error checking macro
+#define CHECK_CUDA_ERROR(call)                                                   \
+    do {                                                                         \
+        cudaError_t err = call;                                                  \
+        if (err != cudaSuccess) {                                                \
+            fprintf(stderr, "CUDA error at %s:%d\n", __FILE__, __LINE__);        \
+            fprintf(stderr, "  Error: %s (%d)\n", cudaGetErrorString(err), err); \
+            exit(EXIT_FAILURE);                                                  \
+        }                                                                        \
+    } while (0)
+
+// function to convert cuSOLVER error codes to strings
+inline const char *
+cusolverGetErrorString(cusolverStatus_t status)
+{
+    switch (status) {
+        case CUSOLVER_STATUS_SUCCESS:
+            return "CUSOLVER_STATUS_SUCCESS";
+        case CUSOLVER_STATUS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_ALLOC_FAILED:
+            return "CUSOLVER_STATUS_ALLOC_FAILED";
+        case CUSOLVER_STATUS_INVALID_VALUE:
+            return "CUSOLVER_STATUS_INVALID_VALUE";
+        case CUSOLVER_STATUS_ARCH_MISMATCH:
+            return "CUSOLVER_STATUS_ARCH_MISMATCH";
+        case CUSOLVER_STATUS_MAPPING_ERROR:
+            return "CUSOLVER_STATUS_MAPPING_ERROR";
+        case CUSOLVER_STATUS_EXECUTION_FAILED:
+            return "CUSOLVER_STATUS_EXECUTION_FAILED";
+        case CUSOLVER_STATUS_INTERNAL_ERROR:
+            return "CUSOLVER_STATUS_INTERNAL_ERROR";
+        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        default:
+            return "Unknown cuSOLVER error";
+    }
+}
+
+// cuSOLVER error checking macro
+#define CHECK_CUSOLVER_ERROR(call)                                                                 \
+    do {                                                                                           \
+        cusolverStatus_t status = (call);                                                          \
+        if (status != CUSOLVER_STATUS_SUCCESS) {                                                   \
+            fprintf(                                                                               \
+                stderr, "cuSOLVER error: %s at %s:%d\n", cusolverGetErrorString(status), __FILE__, \
+                __LINE__);                                                                         \
+            exit(EXIT_FAILURE);                                                                    \
+        }                                                                                          \
+    } while (0)
+
+// constructor
+mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver() :
+    _h_matrix(nullptr),
+    _h_rhs(nullptr),
+    _h_solution(nullptr),
+    _d_matrix(nullptr),
+    _d_rhs(nullptr),
+    _size(0),
+    _is_solver_initialized(false),
+    _allocated_host_memory_type(0),
+    _is_assembly_finalized(false),
+    _cusolver_handle(nullptr),
+    _cuda_stream(nullptr)
+{
+    // initialize cuSOLVER
+    _initialize_cusolver();
+}
+
+// destructor
+mito::solvers::cuda::CUDADenseSolver::~CUDADenseSolver()
+{
+    // finalize cuSOLVER
+    _finalize_cusolver();
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::initialize(size_t size) -> void
+{
+    // check if the solver is already initialized
+    if (_is_solver_initialized) {
+        throw std::logic_error(
+            "Solver is already initialized. Are you sure you want to reinitialize? Then call "
+            "finalize() first.");
+    }
+
+    // check if the size is valid
+    if (size <= 0) {
+        throw std::invalid_argument("Size of the linear system must be greater than zero.");
+    }
+
+    // save the size of the linear system
+    _size = size;
+
+    // allocate host memory
+    _allocate_host_memory(size);
+
+    // initialize host data
+    _initialize_host_data(size);
+
+    // allocate device memory
+    _allocate_device_memory(size);
+
+    // turn on the solver initialized flag
+    _is_solver_initialized = true;
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::finalize() -> void
+{
+    // check if the solver is initialized
+    if (_is_solver_initialized) {
+        // free host memory
+        _free_host_memory();
+
+        // free device memory
+        _free_device_memory();
+    }
+
+    // reset the solver initialized flag
+    _is_solver_initialized = false;
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::reset_system() -> void
+{
+    // check if the solver is initialized
+    if (!_is_solver_initialized) {
+        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
+    }
+
+    // fill the host matrix, rhs and solution with zeros
+    _initialize_host_data(_size);
+
+    // reset the assembly finalized flag
+    _is_assembly_finalized = false;
+
+    // all done
+    return;
+}
+
+// add/insert {value} to matrix entry at ({row}, {col}) of the host copy
+auto
+mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
+    size_t row, size_t col, const mito::real value,
+    const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
+    -> void
+{
+    // check if the system assembly is finalized and throw an error if it is
+    if (_is_assembly_finalized) {
+        throw std::logic_error(
+            "System assembly is already finalized. Cannot add/insert values to the matrix.");
+    }
+
+    // check if the row and column indices are within bounds
+    _check_index_validity(row);
+    _check_index_validity(col);
+
+    // add/insert the value to the matrix entry in the host matrix
+    if (insert_mode == mito::solvers::cuda::InsertMode::ADD_VALUE)
+        _h_matrix[row * _size + col] += value;
+    else if (insert_mode == mito::solvers::cuda::InsertMode::INSERT_VALUE)
+        _h_matrix[row * _size + col] = value;
+    else
+        throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
+
+    // all done
+    return;
+}
+
+// add/insert {value} to rhs entry at {row} of the host copy
+auto
+mito::solvers::cuda::CUDADenseSolver::set_rhs_value(
+    size_t row, const mito::real value,
+    const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
+    -> void
+{
+    // check if the system assembly is finalized and throw an error if it is
+    if (_is_assembly_finalized) {
+        throw std::logic_error(
+            "System assembly is already finalized. Cannot add/insert values to the rhs.");
+    }
+
+    // check if the row index is within bounds
+    _check_index_validity(row);
+
+    // add/insert the value to the rhs entry in the host rhs
+    if (insert_mode == mito::solvers::cuda::InsertMode::ADD_VALUE)
+        _h_rhs[row] += value;
+    else if (insert_mode == mito::solvers::cuda::InsertMode::INSERT_VALUE)
+        _h_rhs[row] = value;
+    else
+        throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::finalize_assembly() -> void
+{
+    // check if the solver is initialized
+    if (!_is_solver_initialized) {
+        throw std::logic_error(
+            "Solver is not yet initialized. Call initialize() first, assemble the "
+            "system, and then finalize the assembly.");
+    }
+
+    // issue a warning that all entries should be set before finalizing the assembly
+    std::cerr
+        << "Warning: Finalizing assembly. Make sure all system entries are set before this step."
+        << std::endl;
+
+    // set the assembly finalized flag to true
+    _is_assembly_finalized = true;
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::solve() -> void
+{
+    // check if the assembly is finalized
+    if (!_is_assembly_finalized) {
+        throw std::logic_error(
+            "System assembly is not yet finalized. Call finalize_assembly() first.");
+    }
+
+    // copy the host matrix and rhs data to device global memory
+    // IMPROVE: We should move the data through streams for better performance later!
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        _d_matrix, _h_matrix, _size * _size * sizeof(mito::real), cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(mito::real), cudaMemcpyHostToDevice));
+
+    // allocate device memory for temporary variables in the LU factorization
+    int * d_pivot = nullptr;
+    int * d_info = nullptr;
+    real * d_workspace = nullptr;
+    int workspace_size = 0;
+
+    // get the workspace size for getrf (only double precision LU factorization is supported!)
+    // QUESTION: Should we check if mito::real is double or float and allocate the workspace memory
+    // accordingly?
+    CHECK_CUSOLVER_ERROR(cusolverDnDgetrf_bufferSize(
+        _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
+
+    CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(mito::real)));
+
+    // perform LU factorization
+    CHECK_CUSOLVER_ERROR(cusolverDnDgetrf(
+        _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
+
+    // solve the linear system
+    CHECK_CUSOLVER_ERROR(cusolverDnDgetrs(
+        _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size, d_info));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
+
+    // copy the solution from device global memory to host memory
+    // NOTE: _d_rhs contains the solution after the call to getrs as its contents are overwritten
+    // by the solution vector
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(mito::real), cudaMemcpyDeviceToHost));
+
+    // free the temporary device memory
+    CHECK_CUDA_ERROR(cudaFree(d_pivot));
+    CHECK_CUDA_ERROR(cudaFree(d_info));
+    CHECK_CUDA_ERROR(cudaFree(d_workspace));
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_initialize_cusolver() -> void
+{
+    // create the cuSOLVER handle
+    CHECK_CUSOLVER_ERROR(cusolverDnCreate(&_cusolver_handle));
+
+    // create a cuda stream
+    CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&_cuda_stream, cudaStreamNonBlocking, 0));
+
+    // set the stream for the cuSOLVER handle
+    CHECK_CUSOLVER_ERROR(cusolverDnSetStream(_cusolver_handle, _cuda_stream));
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_finalize_cusolver() -> void
+{
+    // destroy the cuSOLVER handle
+    CHECK_CUSOLVER_ERROR(cusolverDnDestroy(_cusolver_handle));
+
+    // destroy the cuda stream
+    CHECK_CUDA_ERROR(cudaStreamDestroy(_cuda_stream));
+
+    // reset the handle and stream pointers
+    _cusolver_handle = nullptr;
+    _cuda_stream = nullptr;
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
+{
+    // try to allocate pinned memory on the host for faster transfers
+    cudaError_t err_pinned_alloc_matrix =
+        cudaMallocHost(&_h_matrix, size * size * sizeof(mito::real));
+    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(mito::real));
+    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(mito::real));
+
+    // check if the pinned memory allocation for matrix, rhs, and solution was successful
+    if (err_pinned_alloc_matrix == cudaSuccess && err_pinned_alloc_rhs == cudaSuccess
+        && err_pinned_alloc_solution == cudaSuccess) {
+        // set the flag to indicate that pinned memory was allocated
+        _allocated_host_memory_type = 1;
+        return;
+    }
+
+    // free any partially allocated pinned memory
+    if (err_pinned_alloc_matrix == cudaSuccess)
+        CHECK_CUDA_ERROR(cudaFreeHost(_h_matrix));
+    if (err_pinned_alloc_rhs == cudaSuccess)
+        CHECK_CUDA_ERROR(cudaFreeHost(_h_rhs));
+    if (err_pinned_alloc_solution == cudaSuccess)
+        CHECK_CUDA_ERROR(cudaFreeHost(_h_solution));
+
+    // try to allocate regular memory on the host
+    try {
+        _h_matrix = new mito::real[size * size];
+        _h_rhs = new mito::real[size];
+        _h_solution = new mito::real[size];
+        // set the flag to indicate that regular memory was allocated
+        _allocated_host_memory_type = 2;
+    } catch (const std::bad_alloc & e) {
+        throw std::runtime_error(
+            "Failed to allocate host memory for matrix, rhs, and solution: "
+            + std::string(e.what()));
+    }
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void
+{
+    // allocate global device memory for matrix, rhs, and solution
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(mito::real)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(mito::real)));
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_initialize_host_data(size_t size) -> void
+{
+    // check if host memory is allocated
+    if (_allocated_host_memory_type == 0) {
+        // throw developer error
+        throw std::logic_error(
+            "Host memory is not yet allocated. Call _allocate_host_memory() first.");
+    }
+
+    // initialize the host matrix, rhs and solution with zeros
+    for (size_t i = 0; i < size * size; ++i) {
+        _h_matrix[i] = 0.0;
+    }
+    for (size_t i = 0; i < size; ++i) {
+        _h_rhs[i] = 0.0;
+        _h_solution[i] = 0.0;
+    }
+
+    // all done
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_free_host_memory() -> void
+{
+    // check if pinned memory was allocated
+    if (_allocated_host_memory_type == 1) {
+        // free pinned memory
+        if (_h_matrix)
+            CHECK_CUDA_ERROR(cudaFreeHost(_h_matrix));
+        if (_h_rhs)
+            CHECK_CUDA_ERROR(cudaFreeHost(_h_rhs));
+        if (_h_solution)
+            CHECK_CUDA_ERROR(cudaFreeHost(_h_solution));
+    } else if (_allocated_host_memory_type == 2) {
+        // free regular memory
+        if (_h_matrix)
+            delete[] _h_matrix;
+        if (_h_rhs)
+            delete[] _h_rhs;
+        if (_h_solution)
+            delete[] _h_solution;
+    }
+
+    // reset the flag to indicate that the memory has been freed
+    _allocated_host_memory_type = 0;
+    // reset the pointers
+    _h_matrix = nullptr;
+    _h_rhs = nullptr;
+    _h_solution = nullptr;
+
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_free_device_memory() -> void
+{
+    // free global device memory for matrix and rhs
+    if (_d_matrix)
+        CHECK_CUDA_ERROR(cudaFree(_d_matrix));
+    if (_d_rhs)
+        CHECK_CUDA_ERROR(cudaFree(_d_rhs));
+
+    // reset the pointers
+    _d_matrix = nullptr;
+    _d_rhs = nullptr;
+
+    return;
+}
+
+auto
+mito::solvers::cuda::CUDADenseSolver::_check_index_validity(size_t index) const -> void
+{
+    // check if the solver is initialized
+    // QUESTION: checking multiple times for initialization may be inefficient?
+    if (!_is_solver_initialized) {
+        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
+    }
+
+    // check if the index is valid and return false if it is not
+    if (index < 0 || index >= _size) {
+        throw std::out_of_range(
+            "Index " + std::to_string(index) + " is out of range. It must be between 0 and "
+            + std::to_string(_size - 1) + ".");
+    }
+
+    // all done
+    return;
+}
\ No newline at end of file
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
new file mode 100644
index 000000000..f1688d3fd
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -0,0 +1,104 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+
+    class CUDADenseSolver {
+      public:
+        // constructor
+        CUDADenseSolver();
+
+        // destructor
+        ~CUDADenseSolver();
+
+      public:
+        // initialize the CUDA dense solver
+        auto initialize(size_t size) -> void;
+
+        // finalize the CUDA dense solver
+        auto finalize() -> void;
+
+        // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)
+        auto reset_system() -> void;
+
+        // set (add or insert depending on the mode) the value of a matrix entry in the host copy
+        auto set_matrix_value(size_t, size_t, const mito::real, const InsertMode) -> void;
+
+        // set (add or insert depending on the mode) the value of a right-hand side entry in the
+        // host copy
+        auto set_rhs_value(size_t, const mito::real, const InsertMode) -> void;
+
+        // finalize the linear system assembly
+        auto finalize_assembly() -> void;
+
+        // solve the linear system
+        auto solve() -> void;
+
+      private:
+        // initialize the cuSOLVER utilities
+        auto _initialize_cusolver() -> void;
+
+        // destroy the cuSOLVER utilities
+        auto _finalize_cusolver() -> void;
+
+        // allocate the host memory for the matrix, right-hand side, and solution
+        auto _allocate_host_memory(size_t) -> void;
+
+        // allocate the device memory for the matrix and right-hand side
+        auto _allocate_device_memory(size_t) -> void;
+
+        // initialize the host data for the matrix, right-hand side, and solution
+        auto _initialize_host_data(size_t) -> void;
+
+        // deallocate the host memory for the matrix, right-hand side, and solution
+        auto _free_host_memory() -> void;
+
+        // deallocate the device memory for the matrix and right-hand side
+        auto _free_device_memory() -> void;
+
+        // check the validity of the index in the matrix and right-hand side
+        auto _check_index_validity(size_t) const -> void;
+
+      private:
+        // host copy of the matrix
+        mito::real * _h_matrix;
+        // host copy of the right-hand side
+        mito::real * _h_rhs;
+        // host copy of the solution
+        mito::real * _h_solution;
+        // device copy of the matrix
+        mito::real * _d_matrix;
+        // device copy of the right-hand side
+        mito::real * _d_rhs;
+        // size of the linear system
+        size_t _size;
+        // flag to indicate if the solver has been initialized
+        bool _is_solver_initialized;
+        // flag to indicate which type of host memory has been allocated
+        // 0: no memory allocated
+        // 1: pinned memory allocated
+        // 2: pageable (regular) memory allocated
+        int _allocated_host_memory_type;
+        // flag to indicate if the system assembly has been finalized
+        bool _is_assembly_finalized;
+        // cuSOLVER handle
+        cusolverDnHandle_t _cusolver_handle;
+        // cuda stream
+        cudaStream_t _cuda_stream;
+    };
+}
+
+
+// get the template definitions
+#define mito_solvers_backend_cuda_CUDADenseSolver_icc
+#include "CUDADenseSolver.icc"
+#undef mito_solvers_backend_cuda_CUDADenseSolver_icc
+
+
+// end of file
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
new file mode 100644
index 000000000..faf1e0f7e
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
@@ -0,0 +1,16 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+#if !defined(mito_solvers_backend_cuda_CUDADenseSolver_icc)
+#error This header file contains implementation details of class mito::solvers::cuda::CUDADenseSolver
+#else
+
+// We can add template specializations here for CUDADenseSolver, if needed.
+
+#endif    // mito_solvers_backend_cuda_CUDADenseSolver_icc
+
+
+// end of file
diff --git a/lib/mito/solvers/backend/cuda/api.h b/lib/mito/solvers/backend/cuda/api.h
new file mode 100644
index 000000000..40b42b79f
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/api.h
@@ -0,0 +1,18 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+
+    // cuda dense solver
+    using dense_t = CUDADenseSolver;
+
+}
+
+
+// end of file
\ No newline at end of file
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
new file mode 100644
index 000000000..57325c253
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -0,0 +1,25 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+// externals
+#include <string>
+#include <vector>
+#include <cassert>
+#include <iostream>
+
+
+// cuda support
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+// support
+#include "../../../tensor.h"
+
+
+// end of file
diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
new file mode 100644
index 000000000..3600de7b1
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -0,0 +1,21 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+
+    // cuda dense solver
+    auto dense(const std::string & name)
+    {
+        return dense_t(name);
+    }
+
+}
+
+
+// end of file
\ No newline at end of file
diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h
new file mode 100644
index 000000000..97b757809
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/forward.h
@@ -0,0 +1,20 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+
+    // available insert modes for matrix and right-hand side entries
+    enum InsertMode { ADD_VALUE, INSERT_VALUE };
+
+    // class for CUDA dense solver
+    class CUDADenseSolver;
+}
+
+
+// end of file
diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h
new file mode 100644
index 000000000..92a700df5
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/public.h
@@ -0,0 +1,26 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+// external packages
+#include "externals.h"
+
+// get the forward declarations
+#include "forward.h"
+
+// published types
+// #include "api.h"
+
+// classes
+#include "CUDADenseSolver.h"
+
+// factories implementation
+// #include "factories.h"
+
+
+// end of file
diff --git a/lib/mito/solvers/public.h b/lib/mito/solvers/public.h
index aca9e6a46..698d69016 100644
--- a/lib/mito/solvers/public.h
+++ b/lib/mito/solvers/public.h
@@ -12,4 +12,9 @@
 #endif    // WITH_PETSC
 
 
+#ifdef WITH_CUDA
+#include "backend/cuda/public.h"
+#endif    // WITH_CUDA
+
+
 // end of file

From cc8459e8a4d80caaa83f57f6592cace0a3d12b9c Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Mon, 9 Jun 2025 21:46:20 +0200
Subject: [PATCH 02/35] solvers/cuda/CUDADenseSolver: removed negative index
 comparison while checking for validity

Removed the negative index comparison before adding the element to the stiffness matrix because the size_t type is unsigned anyways and will never be negative. So the comparison is redundant.
---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 1af745895..6bca98c60 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -457,7 +457,7 @@ mito::solvers::cuda::CUDADenseSolver::_check_index_validity(size_t index) const
     }
 
     // check if the index is valid and return false if it is not
-    if (index < 0 || index >= _size) {
+    if (index >= _size) {
         throw std::out_of_range(
             "Index " + std::to_string(index) + " is out of range. It must be between 0 and "
             + std::to_string(_size - 1) + ".");

From d3fcb4c86c76016891ae14c12bae92cf6d4d172b Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Mon, 9 Jun 2025 21:54:19 +0200
Subject: [PATCH 03/35] solvers/cuda/CUDADenseSolver: hardcoded mito::real to
 double to avoid compilation error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For some reason, when the CUDADenseSolver.cu is compiled with the nvcc compiler, it fails to compile with the error:

```
/usr/local/pyre/include/pyre/tensor/traits.h:30:18: error: expected identifier before ‘>’ token
   30 |         template <int N>
      |                  ^
/usr/local/pyre/include/pyre/tensor/traits.h:41:18: error: expected identifier before ‘>’ token
   41 |         template <int N>
      |                  ^
/usr/local/pyre/include/pyre/tensor/algebra.icc:1232:50: error: expected identifier before ‘>’ token
 1232 | template <pyre::tensor::square_matrix_c matrixT>
      |                                                  ^
```

Apparently, the nvcc compiler is not able to compile the some template code in the pyre library which we are using in the CUDADenseSolver.cu to defined the real type. This could be because nvcc doesn't support all the features of the C++ standard library. We hardcoded the mito::real to double for now to avoid this compilation error. We should figure out why this is happening and fix it later.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 32 +++++++++----------
 .../solvers/backend/cuda/CUDADenseSolver.h    | 14 ++++----
 lib/mito/solvers/backend/cuda/externals.h     |  4 +--
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 6bca98c60..c45be1c32 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -157,7 +157,7 @@ mito::solvers::cuda::CUDADenseSolver::reset_system() -> void
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
 auto
 mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
-    size_t row, size_t col, const mito::real value,
+    size_t row, size_t col, const double value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
 {
@@ -186,7 +186,7 @@ mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
 // add/insert {value} to rhs entry at {row} of the host copy
 auto
 mito::solvers::cuda::CUDADenseSolver::set_rhs_value(
-    size_t row, const mito::real value,
+    size_t row, const double value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
 {
@@ -244,15 +244,14 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
 
     // copy the host matrix and rhs data to device global memory
     // IMPROVE: We should move the data through streams for better performance later!
-    CHECK_CUDA_ERROR(cudaMemcpy(
-        _d_matrix, _h_matrix, _size * _size * sizeof(mito::real), cudaMemcpyHostToDevice));
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(mito::real), cudaMemcpyHostToDevice));
+        cudaMemcpy(_d_matrix, _h_matrix, _size * _size * sizeof(double), cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(double), cudaMemcpyHostToDevice));
 
     // allocate device memory for temporary variables in the LU factorization
     int * d_pivot = nullptr;
     int * d_info = nullptr;
-    real * d_workspace = nullptr;
+    double * d_workspace = nullptr;
     int workspace_size = 0;
 
     // get the workspace size for getrf (only double precision LU factorization is supported!)
@@ -263,7 +262,7 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
 
     CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
-    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(mito::real)));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(double)));
 
     // perform LU factorization
     CHECK_CUSOLVER_ERROR(cusolverDnDgetrf(
@@ -279,7 +278,7 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     // NOTE: _d_rhs contains the solution after the call to getrs as its contents are overwritten
     // by the solution vector
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(mito::real), cudaMemcpyDeviceToHost));
+        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(double), cudaMemcpyDeviceToHost));
 
     // free the temporary device memory
     CHECK_CUDA_ERROR(cudaFree(d_pivot));
@@ -327,10 +326,9 @@ auto
 mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
 {
     // try to allocate pinned memory on the host for faster transfers
-    cudaError_t err_pinned_alloc_matrix =
-        cudaMallocHost(&_h_matrix, size * size * sizeof(mito::real));
-    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(mito::real));
-    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(mito::real));
+    cudaError_t err_pinned_alloc_matrix = cudaMallocHost(&_h_matrix, size * size * sizeof(double));
+    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(double));
+    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(double));
 
     // check if the pinned memory allocation for matrix, rhs, and solution was successful
     if (err_pinned_alloc_matrix == cudaSuccess && err_pinned_alloc_rhs == cudaSuccess
@@ -350,9 +348,9 @@ mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
 
     // try to allocate regular memory on the host
     try {
-        _h_matrix = new mito::real[size * size];
-        _h_rhs = new mito::real[size];
-        _h_solution = new mito::real[size];
+        _h_matrix = new double[size * size];
+        _h_rhs = new double[size];
+        _h_solution = new double[size];
         // set the flag to indicate that regular memory was allocated
         _allocated_host_memory_type = 2;
     } catch (const std::bad_alloc & e) {
@@ -369,8 +367,8 @@ auto
 mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void
 {
     // allocate global device memory for matrix, rhs, and solution
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(mito::real)));
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(mito::real)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(double)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(double)));
 
     // all done
     return;
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index f1688d3fd..19b5b66a7 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -28,11 +28,11 @@ namespace mito::solvers::cuda {
         auto reset_system() -> void;
 
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
-        auto set_matrix_value(size_t, size_t, const mito::real, const InsertMode) -> void;
+        auto set_matrix_value(size_t, size_t, const double, const InsertMode) -> void;
 
         // set (add or insert depending on the mode) the value of a right-hand side entry in the
         // host copy
-        auto set_rhs_value(size_t, const mito::real, const InsertMode) -> void;
+        auto set_rhs_value(size_t, const double, const InsertMode) -> void;
 
         // finalize the linear system assembly
         auto finalize_assembly() -> void;
@@ -67,15 +67,15 @@ namespace mito::solvers::cuda {
 
       private:
         // host copy of the matrix
-        mito::real * _h_matrix;
+        double * _h_matrix;
         // host copy of the right-hand side
-        mito::real * _h_rhs;
+        double * _h_rhs;
         // host copy of the solution
-        mito::real * _h_solution;
+        double * _h_solution;
         // device copy of the matrix
-        mito::real * _d_matrix;
+        double * _d_matrix;
         // device copy of the right-hand side
-        mito::real * _d_rhs;
+        double * _d_rhs;
         // size of the linear system
         size_t _size;
         // flag to indicate if the solver has been initialized
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index 57325c253..95de2ef23 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -18,8 +18,8 @@
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
 
-// support
-#include "../../../tensor.h"
+// // support
+// #include "../../../tensor.h"
 
 
 // end of file

From 8f722b5eb70088d11435a17bcb0550d0c8bb2689 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Mon, 9 Jun 2025 21:55:15 +0200
Subject: [PATCH 04/35] .cmake: added support for CUDA compiler and CUDA solver

---
 .cmake/mito-config.cmake.in |  8 +++++++-
 .cmake/mito_cuda.cmake      | 33 +++++++++++++++++++++++++++++++++
 .cmake/mito_sources.cmake   |  7 +++++++
 CMakeLists.txt              |  3 +++
 4 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 .cmake/mito_cuda.cmake

diff --git a/.cmake/mito-config.cmake.in b/.cmake/mito-config.cmake.in
index 124606f58..d792ed33f 100644
--- a/.cmake/mito-config.cmake.in
+++ b/.cmake/mito-config.cmake.in
@@ -44,4 +44,10 @@ endif(@WITH_VTK@)
 if(@WITH_PETSC@)
   # add compiler definitions
   add_definitions(-DWITH_PETSC)
-endif(@WITH_PETSC@)
\ No newline at end of file
+endif(@WITH_PETSC@)
+
+# cuda dependency
+if(@WITH_CUDA@)
+  # add compiler definitions
+  add_definitions(-DWITH_CUDA)
+endif(@WITH_CUDA@)
\ No newline at end of file
diff --git a/.cmake/mito_cuda.cmake b/.cmake/mito_cuda.cmake
new file mode 100644
index 000000000..778e80a53
--- /dev/null
+++ b/.cmake/mito_cuda.cmake
@@ -0,0 +1,33 @@
+# -*- cmake -*-
+#
+# Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+#
+
+
+# CUDA support
+option(WITH_CUDA "Enable support for CUDA" OFF)
+
+# if CUDA is requested
+if(WITH_CUDA)
+    # find CUDA package
+    find_package(CUDAToolkit REQUIRED)
+    # report
+    message(STATUS "Enable CUDA support")
+    # add compiler definitions
+    add_definitions(-DWITH_CUDA)
+    message(STATUS "Add definition WITH_CUDA")
+    # enable CUDA language for targets
+    enable_language(CUDA)
+    # include CUDA headers
+    target_include_directories(mito PUBLIC ${CUDAToolkit_INCLUDE_DIRS})
+    # link against CUDA libraries
+    target_link_libraries(mito PUBLIC CUDA::cudart CUDA::cusolver)
+    # set CUDA host compiler to match the C++ compiler (to tell NVCC the compiler to use for the host code)
+    if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
+        set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE FILEPATH "Host compiler for NVCC")
+    endif()
+    message(STATUS "Using CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
+endif()
+
+
+# end of file
diff --git a/.cmake/mito_sources.cmake b/.cmake/mito_sources.cmake
index 3095b5fea..a26759b8a 100644
--- a/.cmake/mito_sources.cmake
+++ b/.cmake/mito_sources.cmake
@@ -14,5 +14,12 @@ set(MITO_SOURCES ${MITO_SOURCES}
 )
 endif()
 
+# the mito cuda solvers backend
+if (WITH_CUDA)
+set(MITO_SOURCES ${MITO_SOURCES}
+    lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+)
+endif()
+
 
 # end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 458b981ca..665525e3f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,6 +58,9 @@ include(mito_metis)
 # petsc support
 include(mito_petsc)
 
+# cuda support
+include(mito_cuda)
+
 # configure mito as a cmake package (so that mito can be found by cmake with find_package(mito))
 mito_shareCmakePackage()
 

From 1a9c41da165effaeff1bdbd1c41a6c11f97e562e Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sat, 14 Jun 2025 17:57:58 +0200
Subject: [PATCH 05/35] solvers/cuda/CUDADenseSolver: enabled apis & factories
 to create CUDADenseSolver instances

Enabled APIs and factories to create CUDADenseSolver instances in the CUDA backend. We also corrected the CUDADenseSolver instantiation in the factories.h file.
---
 lib/mito/solvers/backend/cuda/factories.h | 4 ++--
 lib/mito/solvers/backend/cuda/public.h    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
index 3600de7b1..638e7ae93 100644
--- a/lib/mito/solvers/backend/cuda/factories.h
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -10,9 +10,9 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    auto dense(const std::string & name)
+    auto dense()
     {
-        return dense_t(name);
+        return dense_t();
     }
 
 }
diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h
index 92a700df5..9b0b64388 100644
--- a/lib/mito/solvers/backend/cuda/public.h
+++ b/lib/mito/solvers/backend/cuda/public.h
@@ -14,13 +14,13 @@
 #include "forward.h"
 
 // published types
-// #include "api.h"
+#include "api.h"
 
 // classes
 #include "CUDADenseSolver.h"
 
 // factories implementation
-// #include "factories.h"
+#include "factories.h"
 
 
 // end of file

From 2a2b8d43331fc9cca3d194b6edf594e524dfd0d0 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sat, 14 Jun 2025 18:00:16 +0200
Subject: [PATCH 06/35] solvers/cuda/CUDADenseSolver: added function to get
 solution from the solver

---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.h |  4 ++++
 .../solvers/backend/cuda/CUDADenseSolver.icc    | 17 ++++++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 19b5b66a7..7455c0129 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -40,6 +40,10 @@ namespace mito::solvers::cuda {
         // solve the linear system
         auto solve() -> void;
 
+        // get the solution vector
+        template <class solutionT>
+        auto get_solution(solutionT & solution) const -> void;
+
       private:
         // initialize the cuSOLVER utilities
         auto _initialize_cusolver() -> void;
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
index faf1e0f7e..dd5c0a973 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
@@ -8,7 +8,22 @@
 #error This header file contains implementation details of class mito::solvers::cuda::CUDADenseSolver
 #else
 
-// We can add template specializations here for CUDADenseSolver, if needed.
+template <class solutionT>
+auto
+mito::solvers::cuda::CUDADenseSolver::get_solution(solutionT & solution) const -> void
+{
+    // check if {_size} matches the size of {solution}
+    assert(_size == std::size(solution));
+
+    // reset the entries in {solution}
+    std::fill(std::begin(solution), std::end(solution), 0.0);
+
+    // copy {_h_solution} into {solution}
+    std::copy(_h_solution, _h_solution + _size, std::begin(solution));
+
+    // all done
+    return;
+}
 
 #endif    // mito_solvers_backend_cuda_CUDADenseSolver_icc
 

From bfe62f6cc7a79b2536405455019607be56fb29b4 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sat, 14 Jun 2025 18:02:10 +0200
Subject: [PATCH 07/35] tests/mito.lib/solvers/cuda_dense: added a simple
 linear system solve test

---
 .cmake/mito_tests_mito_lib.cmake              |  4 +
 .../cuda_dense_solver_solve_linear_system.cc  | 73 +++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc

diff --git a/.cmake/mito_tests_mito_lib.cmake b/.cmake/mito_tests_mito_lib.cmake
index b3a8b9a84..e1fb3bcee 100644
--- a/.cmake/mito_tests_mito_lib.cmake
+++ b/.cmake/mito_tests_mito_lib.cmake
@@ -69,6 +69,10 @@ if(WITH_PETSC)
     mito_test_driver(tests/mito.lib/solvers/petsc_solve_linear_system.cc)
 endif()
 
+if(WITH_CUDA)
+    mito_test_driver(tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc)
+endif()
+
 # tensor
 mito_test_driver(tests/mito.lib/tensor/one_forms.cc)
 mito_test_driver(tests/mito.lib/tensor/contractions.cc)
diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
new file mode 100644
index 000000000..3fd8c7507
--- /dev/null
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -0,0 +1,73 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+#include <gtest/gtest.h>
+#include <mito.h>
+
+
+TEST(Solvers, CUDADenseSolver)
+{
+    // the size of the linear system
+    int N = 10;
+
+    // instantiate a CUDA Dense solver for a linear system of size {N}
+    auto solver = mito::solvers::cuda::dense();
+    solver.initialize(N);
+
+    // set matrix and right-hand side entries
+    for (int i = 0; i < N; i++) {
+        solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        if (i > 0) {
+            solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        if (i < N - 1) {
+            solver.set_matrix_value(i, i + 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+    }
+
+    // finalize the assembly of the linear system
+    solver.finalize_assembly();
+
+    // solve the linear system
+    solver.solve();
+
+    // read the solution
+    auto x = std::vector<double>(N);
+    solver.get_solution(x);
+
+    // check the solution
+    EXPECT_DOUBLE_EQ(x[0], 5.0);
+    EXPECT_DOUBLE_EQ(x[1], 9.0);
+    EXPECT_DOUBLE_EQ(x[2], 12.0);
+    EXPECT_DOUBLE_EQ(x[3], 14.0);
+    EXPECT_DOUBLE_EQ(x[4], 15.0);
+    EXPECT_DOUBLE_EQ(x[5], 15.0);
+    EXPECT_DOUBLE_EQ(x[6], 14.0);
+    EXPECT_DOUBLE_EQ(x[7], 12.0);
+    EXPECT_DOUBLE_EQ(x[8], 9.0);
+    EXPECT_DOUBLE_EQ(x[9], 5.0);
+
+    // finalize the solver
+    solver.finalize();
+
+    // all done
+    return;
+}
+
+
+int
+main(int argc, char ** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);
+    auto result = RUN_ALL_TESTS();
+
+    // all done
+    return result;
+}
+
+
+// end of file

From 27759dac7202f5e0068238e09a6443e828e00308 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 10:02:16 +0200
Subject: [PATCH 08/35] solvers/cuda/CUDADenseSolver: replaced double with
 mito::real for generic support

Replaced the double data type with mito::real in the CUDADenseSolver to support generic types requested by the user.

Note that the code now compiles only with nvcc compatibility changes in the pyre library as some C++20 functionalities such as lambda functions are not supported by the cuda compiler.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 36 ++++++++++---------
 .../solvers/backend/cuda/CUDADenseSolver.h    | 14 ++++----
 lib/mito/solvers/backend/cuda/externals.h     |  4 +--
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index c45be1c32..83fa3f52d 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -157,7 +157,7 @@ mito::solvers::cuda::CUDADenseSolver::reset_system() -> void
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
 auto
 mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
-    size_t row, size_t col, const double value,
+    size_t row, size_t col, const mito::real value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
 {
@@ -186,7 +186,7 @@ mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
 // add/insert {value} to rhs entry at {row} of the host copy
 auto
 mito::solvers::cuda::CUDADenseSolver::set_rhs_value(
-    size_t row, const double value,
+    size_t row, const mito::real value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
 {
@@ -244,25 +244,26 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
 
     // copy the host matrix and rhs data to device global memory
     // IMPROVE: We should move the data through streams for better performance later!
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        _d_matrix, _h_matrix, _size * _size * sizeof(mito::real), cudaMemcpyHostToDevice));
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_d_matrix, _h_matrix, _size * _size * sizeof(double), cudaMemcpyHostToDevice));
-    CHECK_CUDA_ERROR(cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(double), cudaMemcpyHostToDevice));
+        cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(mito::real), cudaMemcpyHostToDevice));
 
     // allocate device memory for temporary variables in the LU factorization
     int * d_pivot = nullptr;
     int * d_info = nullptr;
-    double * d_workspace = nullptr;
+    mito::real * d_workspace = nullptr;
     int workspace_size = 0;
 
     // get the workspace size for getrf (only double precision LU factorization is supported!)
-    // QUESTION: Should we check if mito::real is double or float and allocate the workspace memory
-    // accordingly?
+    // QUESTION: Should we check if mito::real is double or float and allocate the workspace
+    // memory accordingly?
     CHECK_CUSOLVER_ERROR(cusolverDnDgetrf_bufferSize(
         _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
 
     CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
-    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(double)));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(mito::real)));
 
     // perform LU factorization
     CHECK_CUSOLVER_ERROR(cusolverDnDgetrf(
@@ -278,7 +279,7 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     // NOTE: _d_rhs contains the solution after the call to getrs as its contents are overwritten
     // by the solution vector
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(double), cudaMemcpyDeviceToHost));
+        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(mito::real), cudaMemcpyDeviceToHost));
 
     // free the temporary device memory
     CHECK_CUDA_ERROR(cudaFree(d_pivot));
@@ -326,9 +327,10 @@ auto
 mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
 {
     // try to allocate pinned memory on the host for faster transfers
-    cudaError_t err_pinned_alloc_matrix = cudaMallocHost(&_h_matrix, size * size * sizeof(double));
-    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(double));
-    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(double));
+    cudaError_t err_pinned_alloc_matrix =
+        cudaMallocHost(&_h_matrix, size * size * sizeof(mito::real));
+    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(mito::real));
+    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(mito::real));
 
     // check if the pinned memory allocation for matrix, rhs, and solution was successful
     if (err_pinned_alloc_matrix == cudaSuccess && err_pinned_alloc_rhs == cudaSuccess
@@ -348,9 +350,9 @@ mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
 
     // try to allocate regular memory on the host
     try {
-        _h_matrix = new double[size * size];
-        _h_rhs = new double[size];
-        _h_solution = new double[size];
+        _h_matrix = new mito::real[size * size];
+        _h_rhs = new mito::real[size];
+        _h_solution = new mito::real[size];
         // set the flag to indicate that regular memory was allocated
         _allocated_host_memory_type = 2;
     } catch (const std::bad_alloc & e) {
@@ -367,8 +369,8 @@ auto
 mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void
 {
     // allocate global device memory for matrix, rhs, and solution
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(double)));
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(double)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(mito::real)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(mito::real)));
 
     // all done
     return;
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 7455c0129..5728b3850 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -28,11 +28,11 @@ namespace mito::solvers::cuda {
         auto reset_system() -> void;
 
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
-        auto set_matrix_value(size_t, size_t, const double, const InsertMode) -> void;
+        auto set_matrix_value(size_t, size_t, const real, const InsertMode) -> void;
 
         // set (add or insert depending on the mode) the value of a right-hand side entry in the
         // host copy
-        auto set_rhs_value(size_t, const double, const InsertMode) -> void;
+        auto set_rhs_value(size_t, const real, const InsertMode) -> void;
 
         // finalize the linear system assembly
         auto finalize_assembly() -> void;
@@ -71,15 +71,15 @@ namespace mito::solvers::cuda {
 
       private:
         // host copy of the matrix
-        double * _h_matrix;
+        real * _h_matrix;
         // host copy of the right-hand side
-        double * _h_rhs;
+        real * _h_rhs;
         // host copy of the solution
-        double * _h_solution;
+        real * _h_solution;
         // device copy of the matrix
-        double * _d_matrix;
+        real * _d_matrix;
         // device copy of the right-hand side
-        double * _d_rhs;
+        real * _d_rhs;
         // size of the linear system
         size_t _size;
         // flag to indicate if the solver has been initialized
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index 95de2ef23..57325c253 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -18,8 +18,8 @@
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
 
-// // support
-// #include "../../../tensor.h"
+// support
+#include "../../../tensor.h"
 
 
 // end of file

From b3541e435ee0396396e6bd1d8553338dbda5af39 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 10:30:36 +0200
Subject: [PATCH 09/35] solvers/cuda/CUDADenseSolver: added cusolver_traits to
 support different data types

Added the cusolver_traits struct to support double and float data type implementations in the CUDADenseSolver.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 25 ++++++++++++-------
 .../solvers/backend/cuda/CUDADenseSolver.h    | 19 ++++++++++++++
 2 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 83fa3f52d..9d24b0150 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -255,24 +255,31 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     mito::real * d_workspace = nullptr;
     int workspace_size = 0;
 
-    // get the workspace size for getrf (only double precision LU factorization is supported!)
-    // QUESTION: Should we check if mito::real is double or float and allocate the workspace
-    // memory accordingly?
-    CHECK_CUSOLVER_ERROR(cusolverDnDgetrf_bufferSize(
-        _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
+    // check if mito::real is either double or float and throw an error if it is not
+    static_assert(
+        std::is_same_v<mito::real, double> || std::is_same_v<mito::real, float>,
+        "Only double or float types are supported in the CUDA dense solver.");
+
+    // get the workspace size for the LU factorization
+    CHECK_CUSOLVER_ERROR(
+        cusolver_traits<mito::real>::getrf_buffer_size(
+            _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
 
     CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(mito::real)));
 
     // perform LU factorization
-    CHECK_CUSOLVER_ERROR(cusolverDnDgetrf(
-        _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
+    CHECK_CUSOLVER_ERROR(
+        cusolver_traits<mito::real>::getrf(
+            _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // solve the linear system
-    CHECK_CUSOLVER_ERROR(cusolverDnDgetrs(
-        _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size, d_info));
+    CHECK_CUSOLVER_ERROR(
+        cusolver_traits<mito::real>::getrs(
+            _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size,
+            d_info));
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // copy the solution from device global memory to host memory
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 5728b3850..b70704ff6 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -7,6 +7,25 @@
 #pragma once
 
 
+// a struct to hold the cusolver function pointers for different data types
+template <typename T>
+struct cusolver_traits;
+
+template <>
+struct cusolver_traits<double> {
+    static constexpr auto getrf_buffer_size = cusolverDnDgetrf_bufferSize;
+    static constexpr auto getrf = cusolverDnDgetrf;
+    static constexpr auto getrs = cusolverDnDgetrs;
+};
+
+template <>
+struct cusolver_traits<float> {
+    static constexpr auto getrf_buffer_size = cusolverDnSgetrf_bufferSize;
+    static constexpr auto getrf = cusolverDnSgetrf;
+    static constexpr auto getrs = cusolverDnSgetrs;
+};
+
+
 namespace mito::solvers::cuda {
 
     class CUDADenseSolver {

From af6988f98434a796d607c574345189a3db6e6227 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 11:32:01 +0200
Subject: [PATCH 10/35] solvers/cuda/CUDADenseSolver: introduced SolverType
 enum & new parameter in constructor

Introduced the `SolverType` enum to distinguish between different solver types in the CUDA backend. Added solver type as a parameter in the `CUDADenseSolver` constructor so the user can choose the solver type at runtime.
---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.cu | 3 ++-
 lib/mito/solvers/backend/cuda/CUDADenseSolver.h  | 4 +++-
 lib/mito/solvers/backend/cuda/factories.h        | 4 ++--
 lib/mito/solvers/backend/cuda/forward.h          | 3 +++
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 9d24b0150..70d51156d 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -59,7 +59,8 @@ cusolverGetErrorString(cusolverStatus_t status)
     } while (0)
 
 // constructor
-mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver() :
+mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver(mito::solvers::cuda::SolverType solver_type) :
+    _solver_type(solver_type),
     _h_matrix(nullptr),
     _h_rhs(nullptr),
     _h_solution(nullptr),
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index b70704ff6..8f6f2d810 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -31,7 +31,7 @@ namespace mito::solvers::cuda {
     class CUDADenseSolver {
       public:
         // constructor
-        CUDADenseSolver();
+        CUDADenseSolver(mito::solvers::cuda::SolverType);
 
         // destructor
         ~CUDADenseSolver();
@@ -89,6 +89,8 @@ namespace mito::solvers::cuda {
         auto _check_index_validity(size_t) const -> void;
 
       private:
+        // solver type
+        mito::solvers::cuda::SolverType _solver_type;
         // host copy of the matrix
         real * _h_matrix;
         // host copy of the right-hand side
diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
index 638e7ae93..6962e15ab 100644
--- a/lib/mito/solvers/backend/cuda/factories.h
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -10,9 +10,9 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    auto dense()
+    auto dense(mito::solvers::cuda::SolverType solver_type = mito::solvers::cuda::SolverType::LU)
     {
-        return dense_t();
+        return dense_t(solver_type);
     }
 
 }
diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h
index 97b757809..51313f2c6 100644
--- a/lib/mito/solvers/backend/cuda/forward.h
+++ b/lib/mito/solvers/backend/cuda/forward.h
@@ -12,6 +12,9 @@ namespace mito::solvers::cuda {
     // available insert modes for matrix and right-hand side entries
     enum InsertMode { ADD_VALUE, INSERT_VALUE };
 
+    // available solver types
+    enum SolverType { CHOLESKY, LU };
+
     // class for CUDA dense solver
     class CUDADenseSolver;
 }

From c88a099b398854b01ca35473b3acd6ab8fd58a70 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 11:35:53 +0200
Subject: [PATCH 11/35] solvers/cuda/CUDADenseSolver: storing matrix in
 column-major order on host

We are storing the matrix in column-major order on the host since the cuSolver library expects matrices in column-major order and we could avoid doing a transpose later on the GPU.
---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 70d51156d..b15a8a0d2 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -173,10 +173,12 @@ mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
     _check_index_validity(col);
 
     // add/insert the value to the matrix entry in the host matrix
+    // NOTE: We store the matrix in column-major order since the cuSOLVER library expects the matrix
+    // to be in column-major order.
     if (insert_mode == mito::solvers::cuda::InsertMode::ADD_VALUE)
-        _h_matrix[row * _size + col] += value;
+        _h_matrix[col * _size + row] += value;
     else if (insert_mode == mito::solvers::cuda::InsertMode::INSERT_VALUE)
-        _h_matrix[row * _size + col] = value;
+        _h_matrix[col * _size + row] = value;
     else
         throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
 

From 5dea7bc641c6844da62ed91232af72ac63f8c2c0 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 11:41:14 +0200
Subject: [PATCH 12/35] solvers/cuda/CUDADenseSolver: added support for
 Cholesky solver

---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 55 ++++++++++++++-----
 .../solvers/backend/cuda/CUDADenseSolver.h    | 10 ++++
 2 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index b15a8a0d2..943d2dcbf 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -252,7 +252,7 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     CHECK_CUDA_ERROR(
         cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(mito::real), cudaMemcpyHostToDevice));
 
-    // allocate device memory for temporary variables in the LU factorization
+    // allocate device memory for temporary variables in the factorization
     int * d_pivot = nullptr;
     int * d_info = nullptr;
     mito::real * d_workspace = nullptr;
@@ -263,26 +263,55 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
         std::is_same_v<mito::real, double> || std::is_same_v<mito::real, float>,
         "Only double or float types are supported in the CUDA dense solver.");
 
-    // get the workspace size for the LU factorization
-    CHECK_CUSOLVER_ERROR(
-        cusolver_traits<mito::real>::getrf_buffer_size(
-            _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
+    // check the solver type is either LU or Cholesky
+    if (_solver_type != mito::solvers::cuda::SolverType::LU
+        && _solver_type != mito::solvers::cuda::SolverType::CHOLESKY) {
+        throw std::invalid_argument(
+            "Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense "
+            "solver.");
+    }
+
+    // get the workspace size for the factorization
+    if (_solver_type == mito::solvers::cuda::SolverType::LU) {
+        CHECK_CUSOLVER_ERROR(
+            cusolver_traits<mito::real>::getrf_buffer_size(
+                _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
+    } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
+        CHECK_CUSOLVER_ERROR(
+            cusolver_traits<mito::real>::potrf_buffer_size(
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size,
+                &workspace_size));
+    }
 
     CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(mito::real)));
 
-    // perform LU factorization
-    CHECK_CUSOLVER_ERROR(
-        cusolver_traits<mito::real>::getrf(
-            _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
+    // perform the factorization
+    if (_solver_type == mito::solvers::cuda::SolverType::LU) {
+        CHECK_CUSOLVER_ERROR(
+            cusolver_traits<mito::real>::getrf(
+                _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
+    } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
+        CHECK_CUSOLVER_ERROR(
+            cusolver_traits<mito::real>::potrf(
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size, d_workspace,
+                workspace_size, d_info));
+    }
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // solve the linear system
-    CHECK_CUSOLVER_ERROR(
-        cusolver_traits<mito::real>::getrs(
-            _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size,
-            d_info));
+    if (_solver_type == mito::solvers::cuda::SolverType::LU) {
+        CHECK_CUSOLVER_ERROR(
+            cusolver_traits<mito::real>::getrs(
+                _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size,
+                d_info));
+    } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
+        CHECK_CUSOLVER_ERROR(
+            cusolver_traits<mito::real>::potrs(
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, 1, _d_matrix, _size, _d_rhs, _size,
+                d_info));
+    }
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // copy the solution from device global memory to host memory
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 8f6f2d810..5c878421a 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -13,6 +13,11 @@ struct cusolver_traits;
 
 template <>
 struct cusolver_traits<double> {
+    // Cholesky factorization and solve routines for double precision
+    static constexpr auto potrf_buffer_size = cusolverDnDpotrf_bufferSize;
+    static constexpr auto potrf = cusolverDnDpotrf;
+    static constexpr auto potrs = cusolverDnDpotrs;
+    // LU factorization and solve routines for double precision
     static constexpr auto getrf_buffer_size = cusolverDnDgetrf_bufferSize;
     static constexpr auto getrf = cusolverDnDgetrf;
     static constexpr auto getrs = cusolverDnDgetrs;
@@ -20,6 +25,11 @@ struct cusolver_traits<double> {
 
 template <>
 struct cusolver_traits<float> {
+    // Cholesky factorization and solve routines for single precision
+    static constexpr auto potrf_buffer_size = cusolverDnSpotrf_bufferSize;
+    static constexpr auto potrf = cusolverDnSpotrf;
+    static constexpr auto potrs = cusolverDnSpotrs;
+    // LU factorization and solve routines for single precision
     static constexpr auto getrf_buffer_size = cusolverDnSgetrf_bufferSize;
     static constexpr auto getrf = cusolverDnSgetrf;
     static constexpr auto getrs = cusolverDnSgetrs;

From 9f38bec6dc3b0c5a988ec35f3e4bad047e9408f4 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 11:48:32 +0200
Subject: [PATCH 13/35] tests/mito.lib/solvers/cuda_dense: switched to Cholesky
 solver for solving the linear system

---
 tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
index 3fd8c7507..c7b08cc97 100644
--- a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -14,7 +14,7 @@ TEST(Solvers, CUDADenseSolver)
     int N = 10;
 
     // instantiate a CUDA Dense solver for a linear system of size {N}
-    auto solver = mito::solvers::cuda::dense();
+    auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::CHOLESKY);
     solver.initialize(N);
 
     // set matrix and right-hand side entries

From 1e8a49a14600f687a46aa0058cb8817e641795f6 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 12:07:47 +0200
Subject: [PATCH 14/35] tests/mito.lib/solvers/cuda_dense: introduced checks
 with tolerances

Added checks with tolerances as the results were very close but not exactly equal which is fine for floating point operations.
---
 .../cuda_dense_solver_solve_linear_system.cc  | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
index c7b08cc97..0239cdab9 100644
--- a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -13,6 +13,9 @@ TEST(Solvers, CUDADenseSolver)
     // the size of the linear system
     int N = 10;
 
+    // numerical tolerance for the solution
+    double tol = 1.0e-10;
+
     // instantiate a CUDA Dense solver for a linear system of size {N}
     auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::CHOLESKY);
     solver.initialize(N);
@@ -40,16 +43,16 @@ TEST(Solvers, CUDADenseSolver)
     solver.get_solution(x);
 
     // check the solution
-    EXPECT_DOUBLE_EQ(x[0], 5.0);
-    EXPECT_DOUBLE_EQ(x[1], 9.0);
-    EXPECT_DOUBLE_EQ(x[2], 12.0);
-    EXPECT_DOUBLE_EQ(x[3], 14.0);
-    EXPECT_DOUBLE_EQ(x[4], 15.0);
-    EXPECT_DOUBLE_EQ(x[5], 15.0);
-    EXPECT_DOUBLE_EQ(x[6], 14.0);
-    EXPECT_DOUBLE_EQ(x[7], 12.0);
-    EXPECT_DOUBLE_EQ(x[8], 9.0);
-    EXPECT_DOUBLE_EQ(x[9], 5.0);
+    EXPECT_NEAR(x[0], 5.0, tol);
+    EXPECT_NEAR(x[1], 9.0, tol);
+    EXPECT_NEAR(x[2], 12.0, tol);
+    EXPECT_NEAR(x[3], 14.0, tol);
+    EXPECT_NEAR(x[4], 15.0, tol);
+    EXPECT_NEAR(x[5], 15.0, tol);
+    EXPECT_NEAR(x[6], 14.0, tol);
+    EXPECT_NEAR(x[7], 12.0, tol);
+    EXPECT_NEAR(x[8], 9.0, tol);
+    EXPECT_NEAR(x[9], 5.0, tol);
 
     // finalize the solver
     solver.finalize();

From 10a43140d935227262293994f1eda18d7066ed6c Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Tue, 17 Jun 2025 12:14:47 +0200
Subject: [PATCH 15/35] tests/mito.lib/solvers/cuda_dense: added test with an
 unsymmetric matrix

---
 .../cuda_dense_solver_solve_linear_system.cc  | 55 ++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
index 0239cdab9..bc2a1ccb6 100644
--- a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -8,7 +8,7 @@
 #include <mito.h>
 
 
-TEST(Solvers, CUDADenseSolver)
+TEST(Solvers, CUDADenseSolverSymmetricLinearSystem)
 {
     // the size of the linear system
     int N = 10;
@@ -61,6 +61,59 @@ TEST(Solvers, CUDADenseSolver)
     return;
 }
 
+TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem)
+{
+    // the size of the linear system
+    int N = 10;
+
+    // numerical tolerance for the solution
+    double tol = 1.0e-10;
+
+    // instantiate a CUDA Dense solver for a linear system of size {N}
+    auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::LU);
+    solver.initialize(N);
+
+    // set matrix and right-hand side entries
+    for (int i = 0; i < N; i++) {
+        solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        if (i > 0) {
+            solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        if (i < N - 1) {
+            solver.set_matrix_value(i, i + 1, -0.5, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+    }
+
+    // finalize the assembly of the linear system
+    solver.finalize_assembly();
+
+    // solve the linear system
+    solver.solve();
+
+    // read the solution
+    auto x = std::vector<double>(N);
+    solver.get_solution(x);
+
+    // check the solution
+    EXPECT_NEAR(x[0], 0.8284194482079074, tol);
+    EXPECT_NEAR(x[1], 1.3136777928316294, tol);
+    EXPECT_NEAR(x[2], 1.5978722749107030, tol);
+    EXPECT_NEAR(x[3], 1.7641335139795535, tol);
+    EXPECT_NEAR(x[4], 1.8607895060968098, tol);
+    EXPECT_NEAR(x[5], 1.9148909964281315, tol);
+    EXPECT_NEAR(x[6], 1.9379849735189063, tol);
+    EXPECT_NEAR(x[7], 1.9221579012193621, tol);
+    EXPECT_NEAR(x[8], 1.8126616578396353, tol);
+    EXPECT_NEAR(x[9], 1.4063308289198175, tol);
+
+    // finalize the solver
+    solver.finalize();
+
+    // all done
+    return;
+}
+
 
 int
 main(int argc, char ** argv)

From 36160dfef5589a318bfc9c26e8f39c2b9a12f7a1 Mon Sep 17 00:00:00 2001
From: Bianca Giovanardi <b.giovanardi@tudelft.nl>
Date: Thu, 19 Jun 2025 18:19:55 +0200
Subject: [PATCH 16/35] solvers/cuda: {typedef} real type in {CUDADenseSolver}
 class

---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 46 +++++++++----------
 .../solvers/backend/cuda/CUDADenseSolver.h    | 18 +++++---
 2 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 943d2dcbf..42fa7e01e 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -158,7 +158,7 @@ mito::solvers::cuda::CUDADenseSolver::reset_system() -> void
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
 auto
 mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
-    size_t row, size_t col, const mito::real value,
+    size_t row, size_t col, const real_type value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
 {
@@ -189,7 +189,7 @@ mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
 // add/insert {value} to rhs entry at {row} of the host copy
 auto
 mito::solvers::cuda::CUDADenseSolver::set_rhs_value(
-    size_t row, const mito::real value,
+    size_t row, const real_type value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
 {
@@ -248,19 +248,19 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     // copy the host matrix and rhs data to device global memory
     // IMPROVE: We should move the data through streams for better performance later!
     CHECK_CUDA_ERROR(cudaMemcpy(
-        _d_matrix, _h_matrix, _size * _size * sizeof(mito::real), cudaMemcpyHostToDevice));
+        _d_matrix, _h_matrix, _size * _size * sizeof(real_type), cudaMemcpyHostToDevice));
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(mito::real), cudaMemcpyHostToDevice));
+        cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(real_type), cudaMemcpyHostToDevice));
 
     // allocate device memory for temporary variables in the factorization
     int * d_pivot = nullptr;
     int * d_info = nullptr;
-    mito::real * d_workspace = nullptr;
+    real_type * d_workspace = nullptr;
     int workspace_size = 0;
 
-    // check if mito::real is either double or float and throw an error if it is not
+    // check if real_type is either double or float and throw an error if it is not
     static_assert(
-        std::is_same_v<mito::real, double> || std::is_same_v<mito::real, float>,
+        std::is_same_v<real_type, double> || std::is_same_v<real_type, float>,
         "Only double or float types are supported in the CUDA dense solver.");
 
     // check the solver type is either LU or Cholesky
@@ -274,27 +274,27 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     // get the workspace size for the factorization
     if (_solver_type == mito::solvers::cuda::SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
-            cusolver_traits<mito::real>::getrf_buffer_size(
+            cusolver_traits<real_type>::getrf_buffer_size(
                 _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
     } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
-            cusolver_traits<mito::real>::potrf_buffer_size(
+            cusolver_traits<real_type>::potrf_buffer_size(
                 _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size,
                 &workspace_size));
     }
 
     CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
-    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(mito::real)));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(real_type)));
 
     // perform the factorization
     if (_solver_type == mito::solvers::cuda::SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
-            cusolver_traits<mito::real>::getrf(
+            cusolver_traits<real_type>::getrf(
                 _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
     } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
-            cusolver_traits<mito::real>::potrf(
+            cusolver_traits<real_type>::potrf(
                 _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size, d_workspace,
                 workspace_size, d_info));
     }
@@ -303,12 +303,12 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     // solve the linear system
     if (_solver_type == mito::solvers::cuda::SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
-            cusolver_traits<mito::real>::getrs(
+            cusolver_traits<real_type>::getrs(
                 _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size,
                 d_info));
     } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
-            cusolver_traits<mito::real>::potrs(
+            cusolver_traits<real_type>::potrs(
                 _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, 1, _d_matrix, _size, _d_rhs, _size,
                 d_info));
     }
@@ -318,7 +318,7 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     // NOTE: _d_rhs contains the solution after the call to getrs as its contents are overwritten
     // by the solution vector
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(mito::real), cudaMemcpyDeviceToHost));
+        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(real_type), cudaMemcpyDeviceToHost));
 
     // free the temporary device memory
     CHECK_CUDA_ERROR(cudaFree(d_pivot));
@@ -367,9 +367,9 @@ mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
 {
     // try to allocate pinned memory on the host for faster transfers
     cudaError_t err_pinned_alloc_matrix =
-        cudaMallocHost(&_h_matrix, size * size * sizeof(mito::real));
-    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(mito::real));
-    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(mito::real));
+        cudaMallocHost(&_h_matrix, size * size * sizeof(real_type));
+    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(real_type));
+    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(real_type));
 
     // check if the pinned memory allocation for matrix, rhs, and solution was successful
     if (err_pinned_alloc_matrix == cudaSuccess && err_pinned_alloc_rhs == cudaSuccess
@@ -389,9 +389,9 @@ mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
 
     // try to allocate regular memory on the host
     try {
-        _h_matrix = new mito::real[size * size];
-        _h_rhs = new mito::real[size];
-        _h_solution = new mito::real[size];
+        _h_matrix = new real_type[size * size];
+        _h_rhs = new real_type[size];
+        _h_solution = new real_type[size];
         // set the flag to indicate that regular memory was allocated
         _allocated_host_memory_type = 2;
     } catch (const std::bad_alloc & e) {
@@ -408,8 +408,8 @@ auto
 mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void
 {
     // allocate global device memory for matrix, rhs, and solution
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(mito::real)));
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(mito::real)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(real_type)));
+    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(real_type)));
 
     // all done
     return;
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 5c878421a..3a553d401 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -39,6 +39,10 @@ struct cusolver_traits<float> {
 namespace mito::solvers::cuda {
 
     class CUDADenseSolver {
+        public:
+        // type alias for real_type numbers
+        using real_type = mito::real;
+
       public:
         // constructor
         CUDADenseSolver(mito::solvers::cuda::SolverType);
@@ -57,11 +61,11 @@ namespace mito::solvers::cuda {
         auto reset_system() -> void;
 
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
-        auto set_matrix_value(size_t, size_t, const real, const InsertMode) -> void;
+        auto set_matrix_value(size_t, size_t, const real_type, const InsertMode) -> void;
 
         // set (add or insert depending on the mode) the value of a right-hand side entry in the
         // host copy
-        auto set_rhs_value(size_t, const real, const InsertMode) -> void;
+        auto set_rhs_value(size_t, const real_type, const InsertMode) -> void;
 
         // finalize the linear system assembly
         auto finalize_assembly() -> void;
@@ -102,15 +106,15 @@ namespace mito::solvers::cuda {
         // solver type
         mito::solvers::cuda::SolverType _solver_type;
         // host copy of the matrix
-        real * _h_matrix;
+        real_type * _h_matrix;
         // host copy of the right-hand side
-        real * _h_rhs;
+        real_type * _h_rhs;
         // host copy of the solution
-        real * _h_solution;
+        real_type * _h_solution;
         // device copy of the matrix
-        real * _d_matrix;
+        real_type * _d_matrix;
         // device copy of the right-hand side
-        real * _d_rhs;
+        real_type * _d_rhs;
         // size of the linear system
         size_t _size;
         // flag to indicate if the solver has been initialized

From 2f5557ecf9d8cb8bf9b7254e09b0a54ef9fa0a9f Mon Sep 17 00:00:00 2001
From: Bianca Giovanardi <b.giovanardi@tudelft.nl>
Date: Thu, 19 Jun 2025 18:21:20 +0200
Subject: [PATCH 17/35] solvers/cuda: remove dependency of class
 {CUDADenseSolver} from other {mito} headers

This is achieved for example ensuring class {CUDADenseSolver} uses {double} instead of {mito::real} as the underlying {real_type}.
---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.h | 2 +-
 lib/mito/solvers/backend/cuda/externals.h       | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 3a553d401..304a16fb3 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -41,7 +41,7 @@ namespace mito::solvers::cuda {
     class CUDADenseSolver {
         public:
         // type alias for real_type numbers
-        using real_type = mito::real;
+        using real_type = double;
 
       public:
         // constructor
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index 57325c253..cfbe3cb97 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -18,8 +18,5 @@
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
 
-// support
-#include "../../../tensor.h"
-
 
 // end of file

From 81568b99e5a9bc047962317078df9b35cbf8d838 Mon Sep 17 00:00:00 2001
From: Bianca Giovanardi <b.giovanardi@tudelft.nl>
Date: Thu, 19 Jun 2025 18:51:04 +0200
Subject: [PATCH 18/35] solvers/cuda: class {CUDADenseSolver} is now template
 with respect to {real_type}

This ensures that the solver can still be instantiated with a {mito::real}  substitution, while keeping mito objects and cuda objects comiled separately, the formers with {gcc} and the latters with {nvcc}.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 59 +++++++++++++------
 .../solvers/backend/cuda/CUDADenseSolver.h    |  5 +-
 .../solvers/backend/cuda/CUDADenseSolver.icc  |  3 +-
 lib/mito/solvers/backend/cuda/api.h           |  3 +-
 lib/mito/solvers/backend/cuda/factories.h     |  3 +-
 lib/mito/solvers/backend/cuda/forward.h       |  1 +
 .../cuda_dense_solver_solve_linear_system.cc  |  4 +-
 7 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 42fa7e01e..cc9062d04 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -59,7 +59,8 @@ cusolverGetErrorString(cusolverStatus_t status)
     } while (0)
 
 // constructor
-mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver(mito::solvers::cuda::SolverType solver_type) :
+template<typename realT>
+mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(mito::solvers::cuda::SolverType solver_type) :
     _solver_type(solver_type),
     _h_matrix(nullptr),
     _h_rhs(nullptr),
@@ -78,14 +79,16 @@ mito::solvers::cuda::CUDADenseSolver::CUDADenseSolver(mito::solvers::cuda::Solve
 }
 
 // destructor
-mito::solvers::cuda::CUDADenseSolver::~CUDADenseSolver()
+template<typename realT>
+mito::solvers::cuda::CUDADenseSolver<realT>::~CUDADenseSolver()
 {
     // finalize cuSOLVER
     _finalize_cusolver();
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::initialize(size_t size) -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::initialize(size_t size) -> void
 {
     // check if the solver is already initialized
     if (_is_solver_initialized) {
@@ -118,8 +121,9 @@ mito::solvers::cuda::CUDADenseSolver::initialize(size_t size) -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::finalize() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::finalize() -> void
 {
     // check if the solver is initialized
     if (_is_solver_initialized) {
@@ -137,8 +141,9 @@ mito::solvers::cuda::CUDADenseSolver::finalize() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::reset_system() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
 {
     // check if the solver is initialized
     if (!_is_solver_initialized) {
@@ -156,8 +161,9 @@ mito::solvers::cuda::CUDADenseSolver::reset_system() -> void
 }
 
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
+mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
     size_t row, size_t col, const real_type value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
@@ -187,8 +193,9 @@ mito::solvers::cuda::CUDADenseSolver::set_matrix_value(
 }
 
 // add/insert {value} to rhs entry at {row} of the host copy
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::set_rhs_value(
+mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     size_t row, const real_type value,
     const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
     -> void
@@ -214,8 +221,9 @@ mito::solvers::cuda::CUDADenseSolver::set_rhs_value(
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::finalize_assembly() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::finalize_assembly() -> void
 {
     // check if the solver is initialized
     if (!_is_solver_initialized) {
@@ -236,8 +244,9 @@ mito::solvers::cuda::CUDADenseSolver::finalize_assembly() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::solve() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
 {
     // check if the assembly is finalized
     if (!_is_assembly_finalized) {
@@ -329,8 +338,9 @@ mito::solvers::cuda::CUDADenseSolver::solve() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_initialize_cusolver() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
 {
     // create the cuSOLVER handle
     CHECK_CUSOLVER_ERROR(cusolverDnCreate(&_cusolver_handle));
@@ -345,8 +355,9 @@ mito::solvers::cuda::CUDADenseSolver::_initialize_cusolver() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_finalize_cusolver() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
 {
     // destroy the cuSOLVER handle
     CHECK_CUSOLVER_ERROR(cusolverDnDestroy(_cusolver_handle));
@@ -362,8 +373,9 @@ mito::solvers::cuda::CUDADenseSolver::_finalize_cusolver() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size) -> void
 {
     // try to allocate pinned memory on the host for faster transfers
     cudaError_t err_pinned_alloc_matrix =
@@ -404,8 +416,9 @@ mito::solvers::cuda::CUDADenseSolver::_allocate_host_memory(size_t size) -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size) -> void
 {
     // allocate global device memory for matrix, rhs, and solution
     CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(real_type)));
@@ -415,8 +428,9 @@ mito::solvers::cuda::CUDADenseSolver::_allocate_device_memory(size_t size) -> vo
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_initialize_host_data(size_t size) -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_host_data(size_t size) -> void
 {
     // check if host memory is allocated
     if (_allocated_host_memory_type == 0) {
@@ -438,8 +452,9 @@ mito::solvers::cuda::CUDADenseSolver::_initialize_host_data(size_t size) -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_free_host_memory() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_free_host_memory() -> void
 {
     // check if pinned memory was allocated
     if (_allocated_host_memory_type == 1) {
@@ -470,8 +485,9 @@ mito::solvers::cuda::CUDADenseSolver::_free_host_memory() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_free_device_memory() -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
 {
     // free global device memory for matrix and rhs
     if (_d_matrix)
@@ -486,8 +502,9 @@ mito::solvers::cuda::CUDADenseSolver::_free_device_memory() -> void
     return;
 }
 
+template<typename realT>
 auto
-mito::solvers::cuda::CUDADenseSolver::_check_index_validity(size_t index) const -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::_check_index_validity(size_t index) const -> void
 {
     // check if the solver is initialized
     // QUESTION: checking multiple times for initialization may be inefficient?
@@ -504,4 +521,10 @@ mito::solvers::cuda::CUDADenseSolver::_check_index_validity(size_t index) const
 
     // all done
     return;
-}
\ No newline at end of file
+}
+
+// explicit instantiation of the {CUDADenseSolver} class for doubles
+template class mito::solvers::cuda::CUDADenseSolver<double>;
+
+// explicit instantiation of the {CUDADenseSolver} class for floats
+template class mito::solvers::cuda::CUDADenseSolver<float>;
\ No newline at end of file
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 304a16fb3..dc0f03a63 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -38,10 +38,11 @@ struct cusolver_traits<float> {
 
 namespace mito::solvers::cuda {
 
+    template<typename realT>
     class CUDADenseSolver {
         public:
-        // type alias for real_type numbers
-        using real_type = double;
+        // type alias for real numbers
+        using real_type = realT;
 
       public:
         // constructor
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
index dd5c0a973..983657316 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
@@ -8,9 +8,10 @@
 #error This header file contains implementation details of class mito::solvers::cuda::CUDADenseSolver
 #else
 
+template <typename realT>
 template <class solutionT>
 auto
-mito::solvers::cuda::CUDADenseSolver::get_solution(solutionT & solution) const -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::get_solution(solutionT & solution) const -> void
 {
     // check if {_size} matches the size of {solution}
     assert(_size == std::size(solution));
diff --git a/lib/mito/solvers/backend/cuda/api.h b/lib/mito/solvers/backend/cuda/api.h
index 40b42b79f..6317d221a 100644
--- a/lib/mito/solvers/backend/cuda/api.h
+++ b/lib/mito/solvers/backend/cuda/api.h
@@ -10,7 +10,8 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    using dense_t = CUDADenseSolver;
+    template<typename realT>
+    using dense_t = CUDADenseSolver<realT>;
 
 }
 
diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
index 6962e15ab..346a752c8 100644
--- a/lib/mito/solvers/backend/cuda/factories.h
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -10,9 +10,10 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
+    template<typename realT>
     auto dense(mito::solvers::cuda::SolverType solver_type = mito::solvers::cuda::SolverType::LU)
     {
-        return dense_t(solver_type);
+        return dense_t<realT>(solver_type);
     }
 
 }
diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h
index 51313f2c6..2ed383523 100644
--- a/lib/mito/solvers/backend/cuda/forward.h
+++ b/lib/mito/solvers/backend/cuda/forward.h
@@ -16,6 +16,7 @@ namespace mito::solvers::cuda {
     enum SolverType { CHOLESKY, LU };
 
     // class for CUDA dense solver
+    template<typename realT>
     class CUDADenseSolver;
 }
 
diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
index bc2a1ccb6..3ee95da3e 100644
--- a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -17,7 +17,7 @@ TEST(Solvers, CUDADenseSolverSymmetricLinearSystem)
     double tol = 1.0e-10;
 
     // instantiate a CUDA Dense solver for a linear system of size {N}
-    auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::CHOLESKY);
+    auto solver = mito::solvers::cuda::dense<mito::real>(mito::solvers::cuda::SolverType::CHOLESKY);
     solver.initialize(N);
 
     // set matrix and right-hand side entries
@@ -70,7 +70,7 @@ TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem)
     double tol = 1.0e-10;
 
     // instantiate a CUDA Dense solver for a linear system of size {N}
-    auto solver = mito::solvers::cuda::dense(mito::solvers::cuda::SolverType::LU);
+    auto solver = mito::solvers::cuda::dense<mito::real>(mito::solvers::cuda::SolverType::LU);
     solver.initialize(N);
 
     // set matrix and right-hand side entries

From 9a59e4a52886a5b4e2d482aa591c5196e4170994 Mon Sep 17 00:00:00 2001
From: Bianca Giovanardi <b.giovanardi@tudelft.nl>
Date: Fri, 20 Jun 2025 20:07:44 +0200
Subject: [PATCH 19/35] solvers/cuda: add concept for a type representing a
 real value

The template argument of class {CUDADenseSolver} is now required to be a type representing a real value.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 34 +++++++++----------
 .../solvers/backend/cuda/CUDADenseSolver.h    |  2 +-
 .../solvers/backend/cuda/CUDADenseSolver.icc  |  2 +-
 lib/mito/solvers/backend/cuda/api.h           |  2 +-
 lib/mito/solvers/backend/cuda/externals.h     | 10 ++++++
 lib/mito/solvers/backend/cuda/factories.h     |  2 +-
 lib/mito/solvers/backend/cuda/forward.h       |  2 +-
 7 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index cc9062d04..5e7dbe5e6 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -59,7 +59,7 @@ cusolverGetErrorString(cusolverStatus_t status)
     } while (0)
 
 // constructor
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(mito::solvers::cuda::SolverType solver_type) :
     _solver_type(solver_type),
     _h_matrix(nullptr),
@@ -79,14 +79,14 @@ mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(mito::solvers::cuda
 }
 
 // destructor
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::~CUDADenseSolver()
 {
     // finalize cuSOLVER
     _finalize_cusolver();
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::initialize(size_t size) -> void
 {
@@ -121,7 +121,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::initialize(size_t size) -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::finalize() -> void
 {
@@ -141,7 +141,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::finalize() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
 {
@@ -161,7 +161,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
 }
 
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
     size_t row, size_t col, const real_type value,
@@ -193,7 +193,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
 }
 
 // add/insert {value} to rhs entry at {row} of the host copy
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     size_t row, const real_type value,
@@ -221,7 +221,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::finalize_assembly() -> void
 {
@@ -244,7 +244,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::finalize_assembly() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
 {
@@ -338,7 +338,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
 {
@@ -355,7 +355,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
 {
@@ -373,7 +373,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size) -> void
 {
@@ -416,7 +416,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size)
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size) -> void
 {
@@ -428,7 +428,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_host_data(size_t size) -> void
 {
@@ -452,7 +452,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_host_data(size_t size)
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_free_host_memory() -> void
 {
@@ -485,7 +485,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_free_host_memory() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
 {
@@ -502,7 +502,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
     return;
 }
 
-template<typename realT>
+template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_check_index_validity(size_t index) const -> void
 {
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index dc0f03a63..fcecc6fea 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -38,7 +38,7 @@ struct cusolver_traits<float> {
 
 namespace mito::solvers::cuda {
 
-    template<typename realT>
+    template<real_c realT>
     class CUDADenseSolver {
         public:
         // type alias for real numbers
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
index 983657316..fc21f1909 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
@@ -8,7 +8,7 @@
 #error This header file contains implementation details of class mito::solvers::cuda::CUDADenseSolver
 #else
 
-template <typename realT>
+template <mito::solvers::cuda::real_c realT>
 template <class solutionT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::get_solution(solutionT & solution) const -> void
diff --git a/lib/mito/solvers/backend/cuda/api.h b/lib/mito/solvers/backend/cuda/api.h
index 6317d221a..5bfab20de 100644
--- a/lib/mito/solvers/backend/cuda/api.h
+++ b/lib/mito/solvers/backend/cuda/api.h
@@ -10,7 +10,7 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    template<typename realT>
+    template<real_c realT>
     using dense_t = CUDADenseSolver<realT>;
 
 }
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index cfbe3cb97..120b6a1ca 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -12,6 +12,7 @@
 #include <vector>
 #include <cassert>
 #include <iostream>
+#include <type_traits>
 
 
 // cuda support
@@ -19,4 +20,13 @@
 #include <cusolverDn.h>
 
 
+namespace mito::solvers::cuda {
+
+    // concept for a real type
+    template<typename T>
+    concept real_c = std::is_floating_point_v<T>;
+
+}
+
+
 // end of file
diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
index 346a752c8..13e862494 100644
--- a/lib/mito/solvers/backend/cuda/factories.h
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -10,7 +10,7 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    template<typename realT>
+    template<real_c realT>
     auto dense(mito::solvers::cuda::SolverType solver_type = mito::solvers::cuda::SolverType::LU)
     {
         return dense_t<realT>(solver_type);
diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h
index 2ed383523..9a48b8b05 100644
--- a/lib/mito/solvers/backend/cuda/forward.h
+++ b/lib/mito/solvers/backend/cuda/forward.h
@@ -16,7 +16,7 @@ namespace mito::solvers::cuda {
     enum SolverType { CHOLESKY, LU };
 
     // class for CUDA dense solver
-    template<typename realT>
+    template<real_c realT>
     class CUDADenseSolver;
 }
 

From 4c7a268adb7c849b4fcecbb1e58ae4ad2ff57d65 Mon Sep 17 00:00:00 2001
From: Bianca Giovanardi <b.giovanardi@tudelft.nl>
Date: Fri, 20 Jun 2025 20:12:41 +0200
Subject: [PATCH 20/35] solvers/cuda: remove redundant namespace specification

---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 30 +++++++++----------
 .../solvers/backend/cuda/CUDADenseSolver.h    |  4 +--
 lib/mito/solvers/backend/cuda/factories.h     |  2 +-
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 5e7dbe5e6..033b5b7e3 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -60,7 +60,7 @@ cusolverGetErrorString(cusolverStatus_t status)
 
 // constructor
 template<mito::solvers::cuda::real_c realT>
-mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(mito::solvers::cuda::SolverType solver_type) :
+mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_type) :
     _solver_type(solver_type),
     _h_matrix(nullptr),
     _h_rhs(nullptr),
@@ -165,7 +165,7 @@ template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
     size_t row, size_t col, const real_type value,
-    const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
+    const InsertMode insert_mode = InsertMode::ADD_VALUE)
     -> void
 {
     // check if the system assembly is finalized and throw an error if it is
@@ -181,9 +181,9 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
     // add/insert the value to the matrix entry in the host matrix
     // NOTE: We store the matrix in column-major order since the cuSOLVER library expects the matrix
     // to be in column-major order.
-    if (insert_mode == mito::solvers::cuda::InsertMode::ADD_VALUE)
+    if (insert_mode == InsertMode::ADD_VALUE)
         _h_matrix[col * _size + row] += value;
-    else if (insert_mode == mito::solvers::cuda::InsertMode::INSERT_VALUE)
+    else if (insert_mode == InsertMode::INSERT_VALUE)
         _h_matrix[col * _size + row] = value;
     else
         throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
@@ -197,7 +197,7 @@ template<mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     size_t row, const real_type value,
-    const mito::solvers::cuda::InsertMode insert_mode = mito::solvers::cuda::InsertMode::ADD_VALUE)
+    const InsertMode insert_mode = InsertMode::ADD_VALUE)
     -> void
 {
     // check if the system assembly is finalized and throw an error if it is
@@ -210,9 +210,9 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     _check_index_validity(row);
 
     // add/insert the value to the rhs entry in the host rhs
-    if (insert_mode == mito::solvers::cuda::InsertMode::ADD_VALUE)
+    if (insert_mode == InsertMode::ADD_VALUE)
         _h_rhs[row] += value;
-    else if (insert_mode == mito::solvers::cuda::InsertMode::INSERT_VALUE)
+    else if (insert_mode == InsertMode::INSERT_VALUE)
         _h_rhs[row] = value;
     else
         throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
@@ -273,19 +273,19 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
         "Only double or float types are supported in the CUDA dense solver.");
 
     // check the solver type is either LU or Cholesky
-    if (_solver_type != mito::solvers::cuda::SolverType::LU
-        && _solver_type != mito::solvers::cuda::SolverType::CHOLESKY) {
+    if (_solver_type != SolverType::LU
+        && _solver_type != SolverType::CHOLESKY) {
         throw std::invalid_argument(
             "Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense "
             "solver.");
     }
 
     // get the workspace size for the factorization
-    if (_solver_type == mito::solvers::cuda::SolverType::LU) {
+    if (_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrf_buffer_size(
                 _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
-    } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
+    } else if (_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrf_buffer_size(
                 _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size,
@@ -297,11 +297,11 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(real_type)));
 
     // perform the factorization
-    if (_solver_type == mito::solvers::cuda::SolverType::LU) {
+    if (_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrf(
                 _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
-    } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
+    } else if (_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrf(
                 _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size, d_workspace,
@@ -310,12 +310,12 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // solve the linear system
-    if (_solver_type == mito::solvers::cuda::SolverType::LU) {
+    if (_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrs(
                 _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size,
                 d_info));
-    } else if (_solver_type == mito::solvers::cuda::SolverType::CHOLESKY) {
+    } else if (_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrs(
                 _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, 1, _d_matrix, _size, _d_rhs, _size,
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index fcecc6fea..aa97d4d3a 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -46,7 +46,7 @@ namespace mito::solvers::cuda {
 
       public:
         // constructor
-        CUDADenseSolver(mito::solvers::cuda::SolverType);
+        CUDADenseSolver(SolverType);
 
         // destructor
         ~CUDADenseSolver();
@@ -105,7 +105,7 @@ namespace mito::solvers::cuda {
 
       private:
         // solver type
-        mito::solvers::cuda::SolverType _solver_type;
+        SolverType _solver_type;
         // host copy of the matrix
         real_type * _h_matrix;
         // host copy of the right-hand side
diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
index 13e862494..f628c28e0 100644
--- a/lib/mito/solvers/backend/cuda/factories.h
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -11,7 +11,7 @@ namespace mito::solvers::cuda {
 
     // cuda dense solver
     template<real_c realT>
-    auto dense(mito::solvers::cuda::SolverType solver_type = mito::solvers::cuda::SolverType::LU)
+    auto dense(SolverType solver_type = SolverType::LU)
     {
         return dense_t<realT>(solver_type);
     }

From 11123cb9e9408d5e4ef62e6be4cdb42a62067d51 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sat, 5 Jul 2025 19:12:33 +0200
Subject: [PATCH 21/35] solvers/cuda/CUDADenseSolver: added arguments for
 constructor & methods in header  to introduce default arguments

Added default arguments to the constructors and methods in `CUDADenseSolver` so we can add default arguments in the constructor and methods. There are also a few formatting edits in this commit.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 54 +++++++++----------
 .../solvers/backend/cuda/CUDADenseSolver.h    | 22 ++++----
 2 files changed, 37 insertions(+), 39 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 033b5b7e3..77b9aa790 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -59,7 +59,7 @@ cusolverGetErrorString(cusolverStatus_t status)
     } while (0)
 
 // constructor
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_type) :
     _solver_type(solver_type),
     _h_matrix(nullptr),
@@ -71,7 +71,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_t
     _is_solver_initialized(false),
     _allocated_host_memory_type(0),
     _is_assembly_finalized(false),
-    _cusolver_handle(nullptr),
+    _cusolver_handle(),
     _cuda_stream(nullptr)
 {
     // initialize cuSOLVER
@@ -79,14 +79,14 @@ mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_t
 }
 
 // destructor
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::~CUDADenseSolver()
 {
     // finalize cuSOLVER
     _finalize_cusolver();
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::initialize(size_t size) -> void
 {
@@ -121,7 +121,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::initialize(size_t size) -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::finalize() -> void
 {
@@ -141,7 +141,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::finalize() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
 {
@@ -161,12 +161,10 @@ mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
 }
 
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
-    size_t row, size_t col, const real_type value,
-    const InsertMode insert_mode = InsertMode::ADD_VALUE)
-    -> void
+    size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void
 {
     // check if the system assembly is finalized and throw an error if it is
     if (_is_assembly_finalized) {
@@ -193,12 +191,10 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
 }
 
 // add/insert {value} to rhs entry at {row} of the host copy
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
-    size_t row, const real_type value,
-    const InsertMode insert_mode = InsertMode::ADD_VALUE)
-    -> void
+    size_t row, const real_type value, const InsertMode insert_mode) -> void
 {
     // check if the system assembly is finalized and throw an error if it is
     if (_is_assembly_finalized) {
@@ -221,7 +217,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::finalize_assembly() -> void
 {
@@ -244,7 +240,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::finalize_assembly() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
 {
@@ -258,8 +254,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     // IMPROVE: We should move the data through streams for better performance later!
     CHECK_CUDA_ERROR(cudaMemcpy(
         _d_matrix, _h_matrix, _size * _size * sizeof(real_type), cudaMemcpyHostToDevice));
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(real_type), cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(real_type), cudaMemcpyHostToDevice));
 
     // allocate device memory for temporary variables in the factorization
     int * d_pivot = nullptr;
@@ -273,8 +268,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
         "Only double or float types are supported in the CUDA dense solver.");
 
     // check the solver type is either LU or Cholesky
-    if (_solver_type != SolverType::LU
-        && _solver_type != SolverType::CHOLESKY) {
+    if (_solver_type != SolverType::LU && _solver_type != SolverType::CHOLESKY) {
         throw std::invalid_argument(
             "Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense "
             "solver.");
@@ -324,8 +318,8 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // copy the solution from device global memory to host memory
-    // NOTE: _d_rhs contains the solution after the call to getrs as its contents are overwritten
-    // by the solution vector
+    // NOTE: _d_rhs contains the solution after the call to getrs/potrs as its contents are
+    // overwritten by the solution vector
     CHECK_CUDA_ERROR(
         cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(real_type), cudaMemcpyDeviceToHost));
 
@@ -338,7 +332,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
 {
@@ -355,7 +349,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
 {
@@ -373,7 +367,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size) -> void
 {
@@ -416,7 +410,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size)
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size) -> void
 {
@@ -428,7 +422,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_host_data(size_t size) -> void
 {
@@ -452,7 +446,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_host_data(size_t size)
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_free_host_memory() -> void
 {
@@ -485,7 +479,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_free_host_memory() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
 {
@@ -502,7 +496,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
     return;
 }
 
-template<mito::solvers::cuda::real_c realT>
+template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_check_index_validity(size_t index) const -> void
 {
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index aa97d4d3a..5d71195fc 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -38,15 +38,15 @@ struct cusolver_traits<float> {
 
 namespace mito::solvers::cuda {
 
-    template<real_c realT>
+    template <real_c realT>
     class CUDADenseSolver {
-        public:
+      public:
         // type alias for real numbers
         using real_type = realT;
 
       public:
         // constructor
-        CUDADenseSolver(SolverType);
+        CUDADenseSolver(SolverType solver_type = SolverType::LU);
 
         // destructor
         ~CUDADenseSolver();
@@ -62,11 +62,15 @@ namespace mito::solvers::cuda {
         auto reset_system() -> void;
 
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
-        auto set_matrix_value(size_t, size_t, const real_type, const InsertMode) -> void;
+        auto set_matrix_value(
+            size_t row, size_t col, const real_type value,
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void;
 
         // set (add or insert depending on the mode) the value of a right-hand side entry in the
         // host copy
-        auto set_rhs_value(size_t, const real_type, const InsertMode) -> void;
+        auto set_rhs_value(
+            size_t row, const real_type value,
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void;
 
         // finalize the linear system assembly
         auto finalize_assembly() -> void;
@@ -86,13 +90,13 @@ namespace mito::solvers::cuda {
         auto _finalize_cusolver() -> void;
 
         // allocate the host memory for the matrix, right-hand side, and solution
-        auto _allocate_host_memory(size_t) -> void;
+        auto _allocate_host_memory(size_t size) -> void;
 
         // allocate the device memory for the matrix and right-hand side
-        auto _allocate_device_memory(size_t) -> void;
+        auto _allocate_device_memory(size_t size) -> void;
 
         // initialize the host data for the matrix, right-hand side, and solution
-        auto _initialize_host_data(size_t) -> void;
+        auto _initialize_host_data(size_t size) -> void;
 
         // deallocate the host memory for the matrix, right-hand side, and solution
         auto _free_host_memory() -> void;
@@ -101,7 +105,7 @@ namespace mito::solvers::cuda {
         auto _free_device_memory() -> void;
 
         // check the validity of the index in the matrix and right-hand side
-        auto _check_index_validity(size_t) const -> void;
+        auto _check_index_validity(size_t index) const -> void;
 
       private:
         // solver type

From 88e15e851b760315dcbd00e2b5ab1fcf10802d3e Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sun, 10 Aug 2025 11:46:25 +0200
Subject: [PATCH 22/35] solvers/cuda/CUDADenseSolver: added separate header for
 CUDA error checks

---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 51 ----------------
 .../solvers/backend/cuda/CUDAErrorChecks.h    | 60 +++++++++++++++++++
 lib/mito/solvers/backend/cuda/public.h        |  3 +
 3 files changed, 63 insertions(+), 51 deletions(-)
 create mode 100644 lib/mito/solvers/backend/cuda/CUDAErrorChecks.h

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 77b9aa790..b3a11c62a 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -7,57 +7,6 @@
 #include "public.h"
 
 
-// CUDA error checking macro
-#define CHECK_CUDA_ERROR(call)                                                   \
-    do {                                                                         \
-        cudaError_t err = call;                                                  \
-        if (err != cudaSuccess) {                                                \
-            fprintf(stderr, "CUDA error at %s:%d\n", __FILE__, __LINE__);        \
-            fprintf(stderr, "  Error: %s (%d)\n", cudaGetErrorString(err), err); \
-            exit(EXIT_FAILURE);                                                  \
-        }                                                                        \
-    } while (0)
-
-// function to convert cuSOLVER error codes to strings
-inline const char *
-cusolverGetErrorString(cusolverStatus_t status)
-{
-    switch (status) {
-        case CUSOLVER_STATUS_SUCCESS:
-            return "CUSOLVER_STATUS_SUCCESS";
-        case CUSOLVER_STATUS_NOT_INITIALIZED:
-            return "CUSOLVER_STATUS_NOT_INITIALIZED";
-        case CUSOLVER_STATUS_ALLOC_FAILED:
-            return "CUSOLVER_STATUS_ALLOC_FAILED";
-        case CUSOLVER_STATUS_INVALID_VALUE:
-            return "CUSOLVER_STATUS_INVALID_VALUE";
-        case CUSOLVER_STATUS_ARCH_MISMATCH:
-            return "CUSOLVER_STATUS_ARCH_MISMATCH";
-        case CUSOLVER_STATUS_MAPPING_ERROR:
-            return "CUSOLVER_STATUS_MAPPING_ERROR";
-        case CUSOLVER_STATUS_EXECUTION_FAILED:
-            return "CUSOLVER_STATUS_EXECUTION_FAILED";
-        case CUSOLVER_STATUS_INTERNAL_ERROR:
-            return "CUSOLVER_STATUS_INTERNAL_ERROR";
-        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
-        default:
-            return "Unknown cuSOLVER error";
-    }
-}
-
-// cuSOLVER error checking macro
-#define CHECK_CUSOLVER_ERROR(call)                                                                 \
-    do {                                                                                           \
-        cusolverStatus_t status = (call);                                                          \
-        if (status != CUSOLVER_STATUS_SUCCESS) {                                                   \
-            fprintf(                                                                               \
-                stderr, "cuSOLVER error: %s at %s:%d\n", cusolverGetErrorString(status), __FILE__, \
-                __LINE__);                                                                         \
-            exit(EXIT_FAILURE);                                                                    \
-        }                                                                                          \
-    } while (0)
-
 // constructor
 template <mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_type) :
diff --git a/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h
new file mode 100644
index 000000000..d98e0ba0f
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h
@@ -0,0 +1,60 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+// code guard
+#pragma once
+
+
+// CUDA error checking macro
+#define CHECK_CUDA_ERROR(call)                                                   \
+    do {                                                                         \
+        cudaError_t err = call;                                                  \
+        if (err != cudaSuccess) {                                                \
+            fprintf(stderr, "CUDA error at %s:%d\n", __FILE__, __LINE__);        \
+            fprintf(stderr, "  Error: %s (%d)\n", cudaGetErrorString(err), err); \
+            exit(EXIT_FAILURE);                                                  \
+        }                                                                        \
+    } while (0)
+
+// function to convert cuSOLVER error codes to strings
+inline const char *
+cusolverGetErrorString(cusolverStatus_t status)
+{
+    switch (status) {
+        case CUSOLVER_STATUS_SUCCESS:
+            return "CUSOLVER_STATUS_SUCCESS";
+        case CUSOLVER_STATUS_NOT_INITIALIZED:
+            return "CUSOLVER_STATUS_NOT_INITIALIZED";
+        case CUSOLVER_STATUS_ALLOC_FAILED:
+            return "CUSOLVER_STATUS_ALLOC_FAILED";
+        case CUSOLVER_STATUS_INVALID_VALUE:
+            return "CUSOLVER_STATUS_INVALID_VALUE";
+        case CUSOLVER_STATUS_ARCH_MISMATCH:
+            return "CUSOLVER_STATUS_ARCH_MISMATCH";
+        case CUSOLVER_STATUS_MAPPING_ERROR:
+            return "CUSOLVER_STATUS_MAPPING_ERROR";
+        case CUSOLVER_STATUS_EXECUTION_FAILED:
+            return "CUSOLVER_STATUS_EXECUTION_FAILED";
+        case CUSOLVER_STATUS_INTERNAL_ERROR:
+            return "CUSOLVER_STATUS_INTERNAL_ERROR";
+        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+            return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+        default:
+            return "Unknown cuSOLVER error";
+    }
+}
+
+// cuSOLVER error checking macro
+#define CHECK_CUSOLVER_ERROR(call)                                                                 \
+    do {                                                                                           \
+        cusolverStatus_t status = (call);                                                          \
+        if (status != CUSOLVER_STATUS_SUCCESS) {                                                   \
+            fprintf(                                                                               \
+                stderr, "cuSOLVER error: %s at %s:%d\n", cusolverGetErrorString(status), __FILE__, \
+                __LINE__);                                                                         \
+            exit(EXIT_FAILURE);                                                                    \
+        }                                                                                          \
+    } while (0)
diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h
index 9b0b64388..0d6580487 100644
--- a/lib/mito/solvers/backend/cuda/public.h
+++ b/lib/mito/solvers/backend/cuda/public.h
@@ -19,6 +19,9 @@
 // classes
 #include "CUDADenseSolver.h"
 
+// error checks
+#include "CUDAErrorChecks.h"
+
 // factories implementation
 #include "factories.h"
 

From 0c947d4e28789cce5fc09e7777560f7e39dc2d1f Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sun, 10 Aug 2025 21:12:02 +0200
Subject: [PATCH 23/35] solvers/cuda/CUDASolver: added base class for
 implementing attributes & methods common to all CUDA solvers

---
 .cmake/mito_sources.cmake                   |   1 +
 lib/mito/solvers/backend/cuda/CUDASolver.cu | 146 ++++++++++++++++++++
 lib/mito/solvers/backend/cuda/CUDASolver.h  |  94 +++++++++++++
 lib/mito/solvers/backend/cuda/externals.h   |   2 +-
 lib/mito/solvers/backend/cuda/public.h      |   1 +
 5 files changed, 243 insertions(+), 1 deletion(-)
 create mode 100644 lib/mito/solvers/backend/cuda/CUDASolver.cu
 create mode 100644 lib/mito/solvers/backend/cuda/CUDASolver.h

diff --git a/.cmake/mito_sources.cmake b/.cmake/mito_sources.cmake
index a26759b8a..7eede5a15 100644
--- a/.cmake/mito_sources.cmake
+++ b/.cmake/mito_sources.cmake
@@ -17,6 +17,7 @@ endif()
 # the mito cuda solvers backend
 if (WITH_CUDA)
 set(MITO_SOURCES ${MITO_SOURCES}
+    lib/mito/solvers/backend/cuda/CUDASolver.cu
     lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
 )
 endif()
diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.cu b/lib/mito/solvers/backend/cuda/CUDASolver.cu
new file mode 100644
index 000000000..1ecf5cf56
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.cu
@@ -0,0 +1,146 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+#include "public.h"
+
+
+// constructor
+template <mito::solvers::cuda::real_c realT>
+mito::solvers::cuda::CUDASolver<realT>::CUDASolver(SolverType solver_type) :
+    _solver_type(solver_type),
+    _size(0),
+    _is_solver_initialized(false),
+    _is_assembly_finalized(false),
+    _cuda_stream(nullptr)
+{}
+
+// destructor
+template <mito::solvers::cuda::real_c realT>
+mito::solvers::cuda::CUDASolver<realT>::~CUDASolver()
+{}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASolver<realT>::initialize(size_t size) -> void
+{
+    // check if the solver is already initialized
+    if (_is_solver_initialized) {
+        throw std::logic_error(
+            "Solver is already initialized. Are you sure you want to reinitialize? Then call "
+            "finalize() first.");
+    }
+
+    // check if the size is valid
+    if (size <= 0) {
+        throw std::invalid_argument("Size of the linear system must be greater than zero.");
+    }
+
+    // save the size of the linear system
+    _size = size;
+
+    // allocate host memory
+    _allocate_host_memory(size);
+
+    // initialize host data
+    _initialize_host_data(size);
+
+    // allocate device memory
+    _allocate_device_memory(size);
+
+    // turn on the solver initialized flag
+    _is_solver_initialized = true;
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASolver<realT>::finalize() -> void
+{
+    // check if the solver is initialized
+    if (_is_solver_initialized) {
+        // free host memory
+        _free_host_memory();
+
+        // free device memory
+        _free_device_memory();
+    }
+
+    // reset the solver initialized flag
+    _is_solver_initialized = false;
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASolver<realT>::reset_system() -> void
+{
+    // check if the solver is initialized
+    if (!_is_solver_initialized) {
+        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
+    }
+
+    // fill the host matrix, rhs and solution with zeros
+    _initialize_host_data(_size);
+
+    // reset the assembly finalized flag
+    _is_assembly_finalized = false;
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASolver<realT>::finalize_assembly() -> void
+{
+    // check if the solver is initialized
+    if (!_is_solver_initialized) {
+        throw std::logic_error(
+            "Solver is not yet initialized. Call initialize() first, assemble the "
+            "system, and then finalize the assembly.");
+    }
+
+    // issue a warning that all entries should be set before finalizing the assembly
+    std::cerr
+        << "Warning: Finalizing assembly. Make sure all system entries are set before this step."
+        << std::endl;
+
+    // set the assembly finalized flag to true
+    _is_assembly_finalized = true;
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASolver<realT>::_check_index_validity(size_t index) const -> void
+{
+    // check if the solver is initialized
+    if (!_is_solver_initialized) {
+        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
+    }
+
+    // check if the index is valid and return false if it is not
+    if (index >= _size) {
+        throw std::out_of_range(
+            "Index " + std::to_string(index) + " is out of range. It must be between 0 and "
+            + std::to_string(_size - 1) + ".");
+    }
+
+    // all done
+    return;
+}
+
+// explicit instantiation of the {CUDASolver} class for doubles
+template class mito::solvers::cuda::CUDASolver<double>;
+
+// explicit instantiation of the {CUDASolver} class for floats
+template class mito::solvers::cuda::CUDASolver<float>;
diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.h b/lib/mito/solvers/backend/cuda/CUDASolver.h
new file mode 100644
index 000000000..3617c303f
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.h
@@ -0,0 +1,94 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+
+    template <real_c realT>
+    class CUDASolver {
+      public:
+        // type alias for real numbers
+        using real_type = realT;
+
+      public:
+        // constructor
+        CUDASolver(SolverType solver_type);
+
+        // destructor
+        ~CUDASolver();
+
+      public:
+        // initialize the CUDA dense solver
+        auto initialize(size_t size) -> void;
+
+        // finalize the CUDA dense solver
+        auto finalize() -> void;
+
+        // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)
+        auto reset_system() -> void;
+
+        // set (add or insert depending on the mode) the value of a matrix entry in the host copy
+        virtual auto set_matrix_value(
+            size_t row, size_t col, const real_type value,
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void = 0;
+
+        // set (add or insert depending on the mode) the value of a right-hand side entry in the
+        // host copy
+        virtual auto set_rhs_value(
+            size_t row, const real_type value,
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void = 0;
+
+        // finalize the linear system assembly
+        auto finalize_assembly() -> void;
+
+        // solve the linear system
+        virtual auto solve() -> void = 0;
+
+        // get the solution vector
+        virtual auto get_solution(std::vector<real_type> & solution) const -> void = 0;
+
+      protected:
+        // initialize the cuSOLVER utilities
+        virtual auto _initialize_cusolver() -> void = 0;
+
+        // destroy the cuSOLVER utilities
+        virtual auto _finalize_cusolver() -> void = 0;
+
+        // allocate the host memory for the matrix, right-hand side, and solution
+        virtual auto _allocate_host_memory(size_t size) -> void = 0;
+
+        // allocate the device memory for the matrix and right-hand side
+        virtual auto _allocate_device_memory(size_t size) -> void = 0;
+
+        // initialize the host data for the matrix, right-hand side, and solution
+        virtual auto _initialize_host_data(size_t size) -> void = 0;
+
+        // deallocate the host memory for the matrix, right-hand side, and solution
+        virtual auto _free_host_memory() -> void = 0;
+
+        // deallocate the device memory for the matrix and right-hand side
+        virtual auto _free_device_memory() -> void = 0;
+
+        // check the validity of the index in the matrix and right-hand side
+        auto _check_index_validity(size_t index) const -> void;
+
+      protected:
+        // solver type
+        SolverType _solver_type;
+        // size of the linear system
+        size_t _size;
+        // flag to indicate if the solver has been initialized
+        bool _is_solver_initialized;
+        // flag to indicate if the system assembly has been finalized
+        bool _is_assembly_finalized;
+        // cuda stream
+        cudaStream_t _cuda_stream;
+    };
+}
+
+// end of file
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index 120b6a1ca..99140544d 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -23,7 +23,7 @@
 namespace mito::solvers::cuda {
 
     // concept for a real type
-    template<typename T>
+    template <typename T>
     concept real_c = std::is_floating_point_v<T>;
 
 }
diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h
index 0d6580487..84caec4cd 100644
--- a/lib/mito/solvers/backend/cuda/public.h
+++ b/lib/mito/solvers/backend/cuda/public.h
@@ -17,6 +17,7 @@
 #include "api.h"
 
 // classes
+#include "CUDASolver.h"
 #include "CUDADenseSolver.h"
 
 // error checks

From 665a86cfb485396f0bd7031e7733f52e9a1b70e9 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sun, 10 Aug 2025 21:25:25 +0200
Subject: [PATCH 24/35] solvers/cuda/CUDADenseSolver: made derived class of
 CUDASolver

Made the CUDADenseSolver a derived class of CUDASolver to leverage common functionality and improve code reuse.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 201 ++++--------------
 .../solvers/backend/cuda/CUDADenseSolver.h    |  56 ++---
 .../solvers/backend/cuda/CUDADenseSolver.icc  |   8 +-
 3 files changed, 58 insertions(+), 207 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index b3a11c62a..02e5a6d27 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -10,18 +10,14 @@
 // constructor
 template <mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_type) :
-    _solver_type(solver_type),
+    CUDASolver<realT>(solver_type),
     _h_matrix(nullptr),
     _h_rhs(nullptr),
     _h_solution(nullptr),
     _d_matrix(nullptr),
     _d_rhs(nullptr),
-    _size(0),
-    _is_solver_initialized(false),
     _allocated_host_memory_type(0),
-    _is_assembly_finalized(false),
-    _cusolver_handle(),
-    _cuda_stream(nullptr)
+    _cusolver_handle()
 {
     // initialize cuSOLVER
     _initialize_cusolver();
@@ -35,80 +31,6 @@ mito::solvers::cuda::CUDADenseSolver<realT>::~CUDADenseSolver()
     _finalize_cusolver();
 }
 
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::initialize(size_t size) -> void
-{
-    // check if the solver is already initialized
-    if (_is_solver_initialized) {
-        throw std::logic_error(
-            "Solver is already initialized. Are you sure you want to reinitialize? Then call "
-            "finalize() first.");
-    }
-
-    // check if the size is valid
-    if (size <= 0) {
-        throw std::invalid_argument("Size of the linear system must be greater than zero.");
-    }
-
-    // save the size of the linear system
-    _size = size;
-
-    // allocate host memory
-    _allocate_host_memory(size);
-
-    // initialize host data
-    _initialize_host_data(size);
-
-    // allocate device memory
-    _allocate_device_memory(size);
-
-    // turn on the solver initialized flag
-    _is_solver_initialized = true;
-
-    // all done
-    return;
-}
-
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::finalize() -> void
-{
-    // check if the solver is initialized
-    if (_is_solver_initialized) {
-        // free host memory
-        _free_host_memory();
-
-        // free device memory
-        _free_device_memory();
-    }
-
-    // reset the solver initialized flag
-    _is_solver_initialized = false;
-
-    // all done
-    return;
-}
-
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
-{
-    // check if the solver is initialized
-    if (!_is_solver_initialized) {
-        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
-    }
-
-    // fill the host matrix, rhs and solution with zeros
-    _initialize_host_data(_size);
-
-    // reset the assembly finalized flag
-    _is_assembly_finalized = false;
-
-    // all done
-    return;
-}
-
 // add/insert {value} to matrix entry at ({row}, {col}) of the host copy
 template <mito::solvers::cuda::real_c realT>
 auto
@@ -116,22 +38,22 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
     size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void
 {
     // check if the system assembly is finalized and throw an error if it is
-    if (_is_assembly_finalized) {
+    if (this->_is_assembly_finalized) {
         throw std::logic_error(
             "System assembly is already finalized. Cannot add/insert values to the matrix.");
     }
 
     // check if the row and column indices are within bounds
-    _check_index_validity(row);
-    _check_index_validity(col);
+    this->_check_index_validity(row);
+    this->_check_index_validity(col);
 
     // add/insert the value to the matrix entry in the host matrix
     // NOTE: We store the matrix in column-major order since the cuSOLVER library expects the matrix
     // to be in column-major order.
     if (insert_mode == InsertMode::ADD_VALUE)
-        _h_matrix[col * _size + row] += value;
+        _h_matrix[col * this->_size + row] += value;
     else if (insert_mode == InsertMode::INSERT_VALUE)
-        _h_matrix[col * _size + row] = value;
+        _h_matrix[col * this->_size + row] = value;
     else
         throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
 
@@ -146,13 +68,13 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     size_t row, const real_type value, const InsertMode insert_mode) -> void
 {
     // check if the system assembly is finalized and throw an error if it is
-    if (_is_assembly_finalized) {
+    if (this->_is_assembly_finalized) {
         throw std::logic_error(
             "System assembly is already finalized. Cannot add/insert values to the rhs.");
     }
 
     // check if the row index is within bounds
-    _check_index_validity(row);
+    this->_check_index_validity(row);
 
     // add/insert the value to the rhs entry in the host rhs
     if (insert_mode == InsertMode::ADD_VALUE)
@@ -166,35 +88,12 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
     return;
 }
 
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::finalize_assembly() -> void
-{
-    // check if the solver is initialized
-    if (!_is_solver_initialized) {
-        throw std::logic_error(
-            "Solver is not yet initialized. Call initialize() first, assemble the "
-            "system, and then finalize the assembly.");
-    }
-
-    // issue a warning that all entries should be set before finalizing the assembly
-    std::cerr
-        << "Warning: Finalizing assembly. Make sure all system entries are set before this step."
-        << std::endl;
-
-    // set the assembly finalized flag to true
-    _is_assembly_finalized = true;
-
-    // all done
-    return;
-}
-
 template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
 {
     // check if the assembly is finalized
-    if (!_is_assembly_finalized) {
+    if (!this->_is_assembly_finalized) {
         throw std::logic_error(
             "System assembly is not yet finalized. Call finalize_assembly() first.");
     }
@@ -202,8 +101,10 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     // copy the host matrix and rhs data to device global memory
     // IMPROVE: We should move the data through streams for better performance later!
     CHECK_CUDA_ERROR(cudaMemcpy(
-        _d_matrix, _h_matrix, _size * _size * sizeof(real_type), cudaMemcpyHostToDevice));
-    CHECK_CUDA_ERROR(cudaMemcpy(_d_rhs, _h_rhs, _size * sizeof(real_type), cudaMemcpyHostToDevice));
+        _d_matrix, _h_matrix, this->_size * this->_size * sizeof(real_type),
+        cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(
+        cudaMemcpy(_d_rhs, _h_rhs, this->_size * sizeof(real_type), cudaMemcpyHostToDevice));
 
     // allocate device memory for temporary variables in the factorization
     int * d_pivot = nullptr;
@@ -211,58 +112,55 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     real_type * d_workspace = nullptr;
     int workspace_size = 0;
 
-    // check if real_type is either double or float and throw an error if it is not
-    static_assert(
-        std::is_same_v<real_type, double> || std::is_same_v<real_type, float>,
-        "Only double or float types are supported in the CUDA dense solver.");
-
     // check the solver type is either LU or Cholesky
-    if (_solver_type != SolverType::LU && _solver_type != SolverType::CHOLESKY) {
+    if (this->_solver_type != SolverType::LU && this->_solver_type != SolverType::CHOLESKY) {
         throw std::invalid_argument(
             "Invalid solver type. Only LU and Cholesky solvers are supported in the CUDA dense "
             "solver.");
     }
 
     // get the workspace size for the factorization
-    if (_solver_type == SolverType::LU) {
+    if (this->_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrf_buffer_size(
-                _cusolver_handle, _size, _size, _d_matrix, _size, &workspace_size));
-    } else if (_solver_type == SolverType::CHOLESKY) {
+                _cusolver_handle, this->_size, this->_size, _d_matrix, this->_size,
+                &workspace_size));
+    } else if (this->_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrf_buffer_size(
-                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size,
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix, this->_size,
                 &workspace_size));
     }
 
-    CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, _size * sizeof(int)));
+    CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, this->_size * sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_info, sizeof(int)));
     CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size * sizeof(real_type)));
 
     // perform the factorization
-    if (_solver_type == SolverType::LU) {
+    if (this->_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrf(
-                _cusolver_handle, _size, _size, _d_matrix, _size, d_workspace, d_pivot, d_info));
-    } else if (_solver_type == SolverType::CHOLESKY) {
+                _cusolver_handle, this->_size, this->_size, _d_matrix, this->_size, d_workspace,
+                d_pivot, d_info));
+    } else if (this->_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrf(
-                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, _d_matrix, _size, d_workspace,
-                workspace_size, d_info));
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix, this->_size,
+                d_workspace, workspace_size, d_info));
     }
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // solve the linear system
-    if (_solver_type == SolverType::LU) {
+    if (this->_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrs(
-                _cusolver_handle, CUBLAS_OP_N, _size, 1, _d_matrix, _size, d_pivot, _d_rhs, _size,
-                d_info));
-    } else if (_solver_type == SolverType::CHOLESKY) {
+                _cusolver_handle, CUBLAS_OP_N, this->_size, 1, _d_matrix, this->_size, d_pivot,
+                _d_rhs, this->_size, d_info));
+    } else if (this->_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrs(
-                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, _size, 1, _d_matrix, _size, _d_rhs, _size,
-                d_info));
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, 1, _d_matrix, this->_size,
+                _d_rhs, this->_size, d_info));
     }
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
@@ -270,7 +168,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     // NOTE: _d_rhs contains the solution after the call to getrs/potrs as its contents are
     // overwritten by the solution vector
     CHECK_CUDA_ERROR(
-        cudaMemcpy(_h_solution, _d_rhs, _size * sizeof(real_type), cudaMemcpyDeviceToHost));
+        cudaMemcpy(_h_solution, _d_rhs, this->_size * sizeof(real_type), cudaMemcpyDeviceToHost));
 
     // free the temporary device memory
     CHECK_CUDA_ERROR(cudaFree(d_pivot));
@@ -289,10 +187,10 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_cusolver() -> void
     CHECK_CUSOLVER_ERROR(cusolverDnCreate(&_cusolver_handle));
 
     // create a cuda stream
-    CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&_cuda_stream, cudaStreamNonBlocking, 0));
+    CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0));
 
     // set the stream for the cuSOLVER handle
-    CHECK_CUSOLVER_ERROR(cusolverDnSetStream(_cusolver_handle, _cuda_stream));
+    CHECK_CUSOLVER_ERROR(cusolverDnSetStream(_cusolver_handle, this->_cuda_stream));
 
     // all done
     return;
@@ -306,11 +204,11 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_finalize_cusolver() -> void
     CHECK_CUSOLVER_ERROR(cusolverDnDestroy(_cusolver_handle));
 
     // destroy the cuda stream
-    CHECK_CUDA_ERROR(cudaStreamDestroy(_cuda_stream));
+    CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream));
 
     // reset the handle and stream pointers
     _cusolver_handle = nullptr;
-    _cuda_stream = nullptr;
+    this->_cuda_stream = nullptr;
 
     // all done
     return;
@@ -445,29 +343,8 @@ mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
     return;
 }
 
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::_check_index_validity(size_t index) const -> void
-{
-    // check if the solver is initialized
-    // QUESTION: checking multiple times for initialization may be inefficient?
-    if (!_is_solver_initialized) {
-        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
-    }
-
-    // check if the index is valid and return false if it is not
-    if (index >= _size) {
-        throw std::out_of_range(
-            "Index " + std::to_string(index) + " is out of range. It must be between 0 and "
-            + std::to_string(_size - 1) + ".");
-    }
-
-    // all done
-    return;
-}
-
 // explicit instantiation of the {CUDADenseSolver} class for doubles
 template class mito::solvers::cuda::CUDADenseSolver<double>;
 
 // explicit instantiation of the {CUDADenseSolver} class for floats
-template class mito::solvers::cuda::CUDADenseSolver<float>;
\ No newline at end of file
+template class mito::solvers::cuda::CUDADenseSolver<float>;
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 5d71195fc..7fe127472 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -39,10 +39,10 @@ struct cusolver_traits<float> {
 namespace mito::solvers::cuda {
 
     template <real_c realT>
-    class CUDADenseSolver {
-      public:
-        // type alias for real numbers
-        using real_type = realT;
+    class CUDADenseSolver : public CUDASolver<realT> {
+      private:
+        // real type definition from CUDASolver
+        using real_type = typename CUDASolver<realT>::real_type;
 
       public:
         // constructor
@@ -52,64 +52,46 @@ namespace mito::solvers::cuda {
         ~CUDADenseSolver();
 
       public:
-        // initialize the CUDA dense solver
-        auto initialize(size_t size) -> void;
-
-        // finalize the CUDA dense solver
-        auto finalize() -> void;
-
-        // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)
-        auto reset_system() -> void;
-
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
         auto set_matrix_value(
             size_t row, size_t col, const real_type value,
-            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void;
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override;
 
         // set (add or insert depending on the mode) the value of a right-hand side entry in the
         // host copy
         auto set_rhs_value(
             size_t row, const real_type value,
-            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void;
-
-        // finalize the linear system assembly
-        auto finalize_assembly() -> void;
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override;
 
         // solve the linear system
-        auto solve() -> void;
+        auto solve() -> void override;
 
         // get the solution vector
-        template <class solutionT>
-        auto get_solution(solutionT & solution) const -> void;
+        auto get_solution(std::vector<real_type> & solution) const -> void override;
 
       private:
         // initialize the cuSOLVER utilities
-        auto _initialize_cusolver() -> void;
+        auto _initialize_cusolver() -> void override;
 
         // destroy the cuSOLVER utilities
-        auto _finalize_cusolver() -> void;
+        auto _finalize_cusolver() -> void override;
 
         // allocate the host memory for the matrix, right-hand side, and solution
-        auto _allocate_host_memory(size_t size) -> void;
+        auto _allocate_host_memory(size_t size) -> void override;
 
         // allocate the device memory for the matrix and right-hand side
-        auto _allocate_device_memory(size_t size) -> void;
+        auto _allocate_device_memory(size_t size) -> void override;
 
         // initialize the host data for the matrix, right-hand side, and solution
-        auto _initialize_host_data(size_t size) -> void;
+        auto _initialize_host_data(size_t size) -> void override;
 
         // deallocate the host memory for the matrix, right-hand side, and solution
-        auto _free_host_memory() -> void;
+        auto _free_host_memory() -> void override;
 
         // deallocate the device memory for the matrix and right-hand side
-        auto _free_device_memory() -> void;
-
-        // check the validity of the index in the matrix and right-hand side
-        auto _check_index_validity(size_t index) const -> void;
+        auto _free_device_memory() -> void override;
 
       private:
-        // solver type
-        SolverType _solver_type;
         // host copy of the matrix
         real_type * _h_matrix;
         // host copy of the right-hand side
@@ -120,21 +102,13 @@ namespace mito::solvers::cuda {
         real_type * _d_matrix;
         // device copy of the right-hand side
         real_type * _d_rhs;
-        // size of the linear system
-        size_t _size;
-        // flag to indicate if the solver has been initialized
-        bool _is_solver_initialized;
         // flag to indicate which type of host memory has been allocated
         // 0: no memory allocated
         // 1: pinned memory allocated
         // 2: pageable (regular) memory allocated
         int _allocated_host_memory_type;
-        // flag to indicate if the system assembly has been finalized
-        bool _is_assembly_finalized;
         // cuSOLVER handle
         cusolverDnHandle_t _cusolver_handle;
-        // cuda stream
-        cudaStream_t _cuda_stream;
     };
 }
 
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
index fc21f1909..21768cd51 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
@@ -9,18 +9,18 @@
 #else
 
 template <mito::solvers::cuda::real_c realT>
-template <class solutionT>
 auto
-mito::solvers::cuda::CUDADenseSolver<realT>::get_solution(solutionT & solution) const -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::get_solution(std::vector<real_type> & solution) const
+    -> void
 {
     // check if {_size} matches the size of {solution}
-    assert(_size == std::size(solution));
+    assert(this->_size == std::size(solution));
 
     // reset the entries in {solution}
     std::fill(std::begin(solution), std::end(solution), 0.0);
 
     // copy {_h_solution} into {solution}
-    std::copy(_h_solution, _h_solution + _size, std::begin(solution));
+    std::copy(_h_solution, _h_solution + this->_size, std::begin(solution));
 
     // all done
     return;

From a8d79693ab35791691b96bf7d054deb3a86f85fc Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Mon, 1 Sep 2025 20:51:14 +0200
Subject: [PATCH 25/35] solvers/cuda/utilities: added HostArray utility class
 to manage host memory more easily

---
 lib/mito/solvers/backend/cuda/externals.h |   4 +
 lib/mito/solvers/backend/cuda/utilities.h | 178 ++++++++++++++++++++++
 2 files changed, 182 insertions(+)
 create mode 100644 lib/mito/solvers/backend/cuda/utilities.h

diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index 99140544d..139bb748c 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -20,6 +20,10 @@
 #include <cusolverDn.h>
 
 
+// utilities
+#include "utilities.h"
+
+
 namespace mito::solvers::cuda {
 
     // concept for a real type
diff --git a/lib/mito/solvers/backend/cuda/utilities.h b/lib/mito/solvers/backend/cuda/utilities.h
new file mode 100644
index 000000000..a274feb09
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/utilities.h
@@ -0,0 +1,178 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+
+    // a utility class to manage host memory
+    template <std::floating_point T>
+    class HostArray {
+      public:
+        // type alias for real numbers
+        using real_type = T;
+        // available host memory types
+        enum HostMemoryType { PAGEABLE, PINNED };
+
+      public:
+        // default constructor
+        HostArray() : _data(nullptr), _size(0), _memory_type(HostMemoryType::PINNED) {}
+
+        // parameterized constructor
+        HostArray(std::size_t size, HostMemoryType memory_type = HostMemoryType::PINNED) :
+            _data(nullptr),
+            _size(size),
+            _memory_type(memory_type)
+        {
+            allocate(size);
+        }
+
+        // allocate on construction
+        explicit HostArray(std::size_t size) : HostArray() { allocate(size); }
+
+        // destructor
+        ~HostArray() { _free_memory(); }
+
+        // delete copy constructor
+        HostArray(const HostArray &) = delete;
+
+        // delete copy assignment operator
+        HostArray & operator=(const HostArray &) = delete;
+
+        // enable move semantics
+        HostArray(HostArray && other) noexcept :
+            _data(other._data),
+            _size(other._size),
+            _memory_type(other._memory_type)
+        {
+            // reset the state of the moved-from object
+            other._data = nullptr;
+            other._size = 0;
+            other._memory_type = HostMemoryType::PINNED;
+        }
+
+        HostArray & operator=(HostArray && other) noexcept
+        {
+            if (this != &other) {
+                // free existing memory and acquire new memory
+                _free_memory();
+                _data = other._data;
+                _size = other._size;
+                _memory_type = other._memory_type;
+                // reset the state of the moved-from object
+                other._data = nullptr;
+                other._size = 0;
+                other._memory_type = HostMemoryType::PINNED;
+            }
+            return *this;
+        }
+
+        // allocate or reallocate memory for the array
+        void allocate(std::size_t size)
+        {
+            if (_size == size)
+                return;
+
+            // free old memory if any
+            // NOTE: This could be expensive if called frequently. We are going ahead with this
+            // for now for simplicity. But, as a first step towards improvement, we can come up with
+            // a way to reuse existing memory if the new allocation size is smaller than the current
+            // size.
+            _free_memory();
+
+            _size = size;
+            if (_size == 0)
+                return;
+
+            // try to allocate pinned memory first for faster transfers
+            cudaError_t pinned_alloc_error =
+                cudaMallocHost((void **) &_data, _size * sizeof(real_type));
+            if (pinned_alloc_error == cudaSuccess) {
+                _memory_type = HostMemoryType::PINNED;
+            } else {
+                // fallback to pageable memory
+                try {
+                    std::cerr
+                        << "[HostArray] Warning: Failed to allocate pinned memory. Falling back "
+                           "to pageable memory\n";
+                    _data = new real_type[_size];
+                    _memory_type = HostMemoryType::PAGEABLE;
+                } catch (const std::bad_alloc & e) {
+                    throw std::runtime_error(
+                        "Failed to allocate host memory: " + std::string(e.what()));
+                }
+            }
+
+            // all done
+            return;
+        }
+
+        // zero out the array
+        void zero()
+        {
+            if (_data) {
+                std::fill(_data, _data + _size, static_cast<real_type>(0.0));
+            }
+        }
+
+        // data accessor
+        real_type * data() { return _data; }
+
+        // const data accessor
+        const real_type * data() const { return _data; }
+
+        // size accessor
+        std::size_t size() const { return _size; }
+
+        // accessor for memory type
+        HostMemoryType memory_type() const { return _memory_type; }
+
+        // element access
+        real_type & operator[](std::size_t idx) { return _data[idx]; }
+
+        // const element access
+        const real_type & operator[](std::size_t idx) const { return _data[idx]; }
+
+      private:
+        // free the memory in the array
+        void _free_memory()
+        {
+            if (!_data)
+                return;
+
+            // free the memory
+            if (_memory_type == HostMemoryType::PINNED) {
+                cudaError_t error = cudaFreeHost(_data);
+                if (error != cudaSuccess) {
+                    std::cerr << "[HostArray] Error: Failed to free pinned memory: "
+                              << cudaGetErrorString(error) << "\n";
+                }
+            } else {
+                delete[] _data;
+            }
+
+            // reset the state
+            _data = nullptr;
+            _size = 0;
+            _memory_type = HostMemoryType::PINNED;
+
+            // all done
+            return;
+        }
+
+      private:
+        // pointer to the data
+        real_type * _data;
+        // size of the array
+        std::size_t _size;
+        // type of memory
+        HostMemoryType _memory_type;
+    };
+}
+
+
+// end of file

From 3b5e210466e53bb22e07b87fcd415578ce8dfe18 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sun, 7 Sep 2025 11:45:23 +0200
Subject: [PATCH 26/35] solvers/cuda/utilities: added DeviceArray utility class
 to manage device memory

Similar to the HostArray class, we added a DeviceArray class to manage device memory for allocation and deallocation more easily.
---
 lib/mito/solvers/backend/cuda/utilities.h | 120 ++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/lib/mito/solvers/backend/cuda/utilities.h b/lib/mito/solvers/backend/cuda/utilities.h
index a274feb09..41be8c05f 100644
--- a/lib/mito/solvers/backend/cuda/utilities.h
+++ b/lib/mito/solvers/backend/cuda/utilities.h
@@ -172,6 +172,126 @@ namespace mito::solvers::cuda {
         // type of memory
         HostMemoryType _memory_type;
     };
+
+    // a utility class to manage device memory
+    template <std::floating_point T>
+    class DeviceArray {
+      public:
+        using real_type = T;
+
+      public:
+        // default constructor
+        DeviceArray() : _data(nullptr), _size(0) {}
+
+        // allocate on construction
+        explicit DeviceArray(std::size_t size) : DeviceArray() { allocate(size); }
+
+        // destructor
+        ~DeviceArray() { _free_memory(); }
+
+        // delete copy constructor
+        DeviceArray(const DeviceArray &) = delete;
+
+        // delete copy assignment operator
+        DeviceArray & operator=(const DeviceArray &) = delete;
+
+        // enable move semantics
+        DeviceArray(DeviceArray && other) noexcept : _data(other._data), _size(other._size)
+        {
+            // reset the state of the moved-from object
+            other._data = nullptr;
+            other._size = 0;
+        }
+
+        DeviceArray & operator=(DeviceArray && other) noexcept
+        {
+            if (this != &other) {
+                // free existing memory and acquire new memory
+                _free_memory();
+                _data = other._data;
+                _size = other._size;
+                // reset the state of the moved-from object
+                other._data = nullptr;
+                other._size = 0;
+            }
+            return *this;
+        }
+
+        // allocate or reallocate memory for the array
+        void allocate(std::size_t size)
+        {
+            if (_size == size)
+                return;
+
+            // free old memory if any
+            _free_memory();
+
+            _size = size;
+            if (_size == 0)
+                return;
+
+            // allocate device memory
+            cudaError_t error = cudaMalloc((void **) &_data, size * sizeof(real_type));
+            if (error != cudaSuccess) {
+                throw std::runtime_error(
+                    "[DeviceArray] Error: Failed to allocate device memory: "
+                    + std::string(cudaGetErrorString(error)));
+            }
+
+            // all done
+            return;
+        }
+
+        // zero out the device array
+        void zero()
+        {
+            if (_data) {
+                cudaError_t error = cudaMemset(_data, 0, _size * sizeof(real_type));
+                if (error != cudaSuccess) {
+                    throw std::runtime_error(
+                        "[DeviceArray] Error: Failed to zero out device memory: "
+                        + std::string(cudaGetErrorString(error)));
+                }
+            }
+        }
+
+        // data accessor
+        real_type * data() { return _data; }
+
+        // const data accessor
+        const real_type * data() const { return _data; }
+
+        // size accessor
+        std::size_t size() const { return _size; }
+
+      private:
+        // free the memory in the array
+        void _free_memory()
+        {
+            if (!_data)
+                return;
+
+            // free the memory
+            cudaError_t error = cudaFree(_data);
+            if (error != cudaSuccess) {
+                std::cerr << "[DeviceArray] Error: Failed to free device memory: "
+                          << cudaGetErrorString(error) << "\n";
+            }
+
+            // reset the state
+            _data = nullptr;
+            _size = 0;
+
+            // all done
+            return;
+        }
+
+      private:
+        // pointer to the data
+        real_type * _data;
+        // size of the array
+        std::size_t _size;
+    };
 }
 
 

From 76ee128174ae5140af38a9223b4bf1012f87075d Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Sun, 7 Sep 2025 17:40:42 +0200
Subject: [PATCH 27/35] solvers/cuda/CUDASolver & CUDADenseSolver: multiple
 changes with use of Host, Device Arrays & moved few common members to base
 class

We made multiple changes in CUDASolver and CUDADenseSolver classes in this commit:

1. We moved few common methods and attributes to the base class CUDASolver so we can reuse them in future for CUDASparseSolver class.
2. Used the HostArray and DeviceArray classes for managing host and device memory respectively. This change reduced a lot of code clutter related to memory management.
3. Removed a lot of unnecessary methods to free/initialize memory as the HostArray and DeviceArray classes automatically manage memory now.
4. A few other methods/attributes are also removed which are not needed.
---
 .../solvers/backend/cuda/CUDADenseSolver.cu   | 189 ++++--------------
 .../solvers/backend/cuda/CUDADenseSolver.h    |  40 +---
 lib/mito/solvers/backend/cuda/CUDASolver.cu   |  54 ++---
 lib/mito/solvers/backend/cuda/CUDASolver.h    |  32 +--
 .../{CUDADenseSolver.icc => CUDASolver.icc}   |  14 +-
 lib/mito/solvers/backend/cuda/forward.h       |   2 +-
 .../cuda_dense_solver_solve_linear_system.cc  |   6 -
 7 files changed, 91 insertions(+), 246 deletions(-)
 rename lib/mito/solvers/backend/cuda/{CUDADenseSolver.icc => CUDASolver.icc} (53%)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 02e5a6d27..969a94538 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -11,12 +11,8 @@
 template <mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDADenseSolver<realT>::CUDADenseSolver(SolverType solver_type) :
     CUDASolver<realT>(solver_type),
-    _h_matrix(nullptr),
-    _h_rhs(nullptr),
-    _h_solution(nullptr),
-    _d_matrix(nullptr),
-    _d_rhs(nullptr),
-    _allocated_host_memory_type(0),
+    _h_matrix(),
+    _d_matrix(),
     _cusolver_handle()
 {
     // initialize cuSOLVER
@@ -61,28 +57,22 @@ mito::solvers::cuda::CUDADenseSolver<realT>::set_matrix_value(
     return;
 }
 
-// add/insert {value} to rhs entry at {row} of the host copy
 template <mito::solvers::cuda::real_c realT>
 auto
-mito::solvers::cuda::CUDADenseSolver<realT>::set_rhs_value(
-    size_t row, const real_type value, const InsertMode insert_mode) -> void
+mito::solvers::cuda::CUDADenseSolver<realT>::reset_system() -> void
 {
-    // check if the system assembly is finalized and throw an error if it is
-    if (this->_is_assembly_finalized) {
-        throw std::logic_error(
-            "System assembly is already finalized. Cannot add/insert values to the rhs.");
+    // check if the solver is initialized
+    if (!this->_is_solver_initialized) {
+        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
     }
 
-    // check if the row index is within bounds
-    this->_check_index_validity(row);
+    // fill the host matrix, rhs and solution with zeros
+    _h_matrix.zero();
+    this->_h_rhs.zero();
+    this->_h_solution.zero();
 
-    // add/insert the value to the rhs entry in the host rhs
-    if (insert_mode == InsertMode::ADD_VALUE)
-        _h_rhs[row] += value;
-    else if (insert_mode == InsertMode::INSERT_VALUE)
-        _h_rhs[row] = value;
-    else
-        throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
+    // reset the assembly finalized flag
+    this->_is_assembly_finalized = false;
 
     // all done
     return;
@@ -101,10 +91,11 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     // copy the host matrix and rhs data to device global memory
     // IMPROVE: We should move the data through streams for better performance later!
     CHECK_CUDA_ERROR(cudaMemcpy(
-        _d_matrix, _h_matrix, this->_size * this->_size * sizeof(real_type),
+        _d_matrix.data(), _h_matrix.data(), this->_size * this->_size * sizeof(real_type),
+        cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type),
         cudaMemcpyHostToDevice));
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(_d_rhs, _h_rhs, this->_size * sizeof(real_type), cudaMemcpyHostToDevice));
 
     // allocate device memory for temporary variables in the factorization
     int * d_pivot = nullptr;
@@ -123,13 +114,13 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     if (this->_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrf_buffer_size(
-                _cusolver_handle, this->_size, this->_size, _d_matrix, this->_size,
+                _cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size,
                 &workspace_size));
     } else if (this->_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrf_buffer_size(
-                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix, this->_size,
-                &workspace_size));
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(),
+                this->_size, &workspace_size));
     }
 
     CHECK_CUDA_ERROR(cudaMalloc(&d_pivot, this->_size * sizeof(int)));
@@ -140,13 +131,13 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     if (this->_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrf(
-                _cusolver_handle, this->_size, this->_size, _d_matrix, this->_size, d_workspace,
-                d_pivot, d_info));
+                _cusolver_handle, this->_size, this->_size, _d_matrix.data(), this->_size,
+                d_workspace, d_pivot, d_info));
     } else if (this->_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrf(
-                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix, this->_size,
-                d_workspace, workspace_size, d_info));
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, _d_matrix.data(),
+                this->_size, d_workspace, workspace_size, d_info));
     }
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
@@ -154,21 +145,22 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     if (this->_solver_type == SolverType::LU) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::getrs(
-                _cusolver_handle, CUBLAS_OP_N, this->_size, 1, _d_matrix, this->_size, d_pivot,
-                _d_rhs, this->_size, d_info));
+                _cusolver_handle, CUBLAS_OP_N, this->_size, 1, _d_matrix.data(), this->_size,
+                d_pivot, this->_d_rhs.data(), this->_size, d_info));
     } else if (this->_solver_type == SolverType::CHOLESKY) {
         CHECK_CUSOLVER_ERROR(
             cusolver_traits<real_type>::potrs(
-                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, 1, _d_matrix, this->_size,
-                _d_rhs, this->_size, d_info));
+                _cusolver_handle, CUBLAS_FILL_MODE_LOWER, this->_size, 1, _d_matrix.data(),
+                this->_size, this->_d_rhs.data(), this->_size, d_info));
     }
     CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // copy the solution from device global memory to host memory
     // NOTE: _d_rhs contains the solution after the call to getrs/potrs as its contents are
     // overwritten by the solution vector
-    CHECK_CUDA_ERROR(
-        cudaMemcpy(_h_solution, _d_rhs, this->_size * sizeof(real_type), cudaMemcpyDeviceToHost));
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        this->_h_solution.data(), this->_d_rhs.data(), this->_size * sizeof(real_type),
+        cudaMemcpyDeviceToHost));
 
     // free the temporary device memory
     CHECK_CUDA_ERROR(cudaFree(d_pivot));
@@ -218,40 +210,15 @@ template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_host_memory(size_t size) -> void
 {
-    // try to allocate pinned memory on the host for faster transfers
-    cudaError_t err_pinned_alloc_matrix =
-        cudaMallocHost(&_h_matrix, size * size * sizeof(real_type));
-    cudaError_t err_pinned_alloc_rhs = cudaMallocHost(&_h_rhs, size * sizeof(real_type));
-    cudaError_t err_pinned_alloc_solution = cudaMallocHost(&_h_solution, size * sizeof(real_type));
-
-    // check if the pinned memory allocation for matrix, rhs, and solution was successful
-    if (err_pinned_alloc_matrix == cudaSuccess && err_pinned_alloc_rhs == cudaSuccess
-        && err_pinned_alloc_solution == cudaSuccess) {
-        // set the flag to indicate that pinned memory was allocated
-        _allocated_host_memory_type = 1;
-        return;
-    }
+    // allocate host memory for matrix, rhs and solution
+    _h_matrix.allocate(size * size);
+    this->_h_rhs.allocate(size);
+    this->_h_solution.allocate(size);
 
-    // free any partially allocated pinned memory
-    if (err_pinned_alloc_matrix == cudaSuccess)
-        CHECK_CUDA_ERROR(cudaFreeHost(_h_matrix));
-    if (err_pinned_alloc_rhs == cudaSuccess)
-        CHECK_CUDA_ERROR(cudaFreeHost(_h_rhs));
-    if (err_pinned_alloc_solution == cudaSuccess)
-        CHECK_CUDA_ERROR(cudaFreeHost(_h_solution));
-
-    // try to allocate regular memory on the host
-    try {
-        _h_matrix = new real_type[size * size];
-        _h_rhs = new real_type[size];
-        _h_solution = new real_type[size];
-        // set the flag to indicate that regular memory was allocated
-        _allocated_host_memory_type = 2;
-    } catch (const std::bad_alloc & e) {
-        throw std::runtime_error(
-            "Failed to allocate host memory for matrix, rhs, and solution: "
-            + std::string(e.what()));
-    }
+    // zero out the arrays
+    _h_matrix.zero();
+    this->_h_rhs.zero();
+    this->_h_solution.zero();
 
     // all done
     return;
@@ -261,88 +228,14 @@ template <mito::solvers::cuda::real_c realT>
 auto
 mito::solvers::cuda::CUDADenseSolver<realT>::_allocate_device_memory(size_t size) -> void
 {
-    // allocate global device memory for matrix, rhs, and solution
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_matrix, size * size * sizeof(real_type)));
-    CHECK_CUDA_ERROR(cudaMalloc(&_d_rhs, size * sizeof(real_type)));
-
-    // all done
-    return;
-}
-
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::_initialize_host_data(size_t size) -> void
-{
-    // check if host memory is allocated
-    if (_allocated_host_memory_type == 0) {
-        // throw developer error
-        throw std::logic_error(
-            "Host memory is not yet allocated. Call _allocate_host_memory() first.");
-    }
-
-    // initialize the host matrix, rhs and solution with zeros
-    for (size_t i = 0; i < size * size; ++i) {
-        _h_matrix[i] = 0.0;
-    }
-    for (size_t i = 0; i < size; ++i) {
-        _h_rhs[i] = 0.0;
-        _h_solution[i] = 0.0;
-    }
+    // allocate global device memory for matrix and rhs
+    _d_matrix.allocate(size * size);
+    this->_d_rhs.allocate(size);
 
     // all done
     return;
 }
 
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::_free_host_memory() -> void
-{
-    // check if pinned memory was allocated
-    if (_allocated_host_memory_type == 1) {
-        // free pinned memory
-        if (_h_matrix)
-            CHECK_CUDA_ERROR(cudaFreeHost(_h_matrix));
-        if (_h_rhs)
-            CHECK_CUDA_ERROR(cudaFreeHost(_h_rhs));
-        if (_h_solution)
-            CHECK_CUDA_ERROR(cudaFreeHost(_h_solution));
-    } else if (_allocated_host_memory_type == 2) {
-        // free regular memory
-        if (_h_matrix)
-            delete[] _h_matrix;
-        if (_h_rhs)
-            delete[] _h_rhs;
-        if (_h_solution)
-            delete[] _h_solution;
-    }
-
-    // reset the flag to indicate that the memory has been freed
-    _allocated_host_memory_type = 0;
-    // reset the pointers
-    _h_matrix = nullptr;
-    _h_rhs = nullptr;
-    _h_solution = nullptr;
-
-    return;
-}
-
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDADenseSolver<realT>::_free_device_memory() -> void
-{
-    // free global device memory for matrix and rhs
-    if (_d_matrix)
-        CHECK_CUDA_ERROR(cudaFree(_d_matrix));
-    if (_d_rhs)
-        CHECK_CUDA_ERROR(cudaFree(_d_rhs));
-
-    // reset the pointers
-    _d_matrix = nullptr;
-    _d_rhs = nullptr;
-
-    return;
-}
-
 // explicit instantiation of the {CUDADenseSolver} class for doubles
 template class mito::solvers::cuda::CUDADenseSolver<double>;
 
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
index 7fe127472..0c26e113f 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.h
@@ -57,18 +57,12 @@ namespace mito::solvers::cuda {
             size_t row, size_t col, const real_type value,
             const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override;
 
-        // set (add or insert depending on the mode) the value of a right-hand side entry in the
-        // host copy
-        auto set_rhs_value(
-            size_t row, const real_type value,
-            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override;
+        // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)
+        auto reset_system() -> void override;
 
         // solve the linear system
         auto solve() -> void override;
 
-        // get the solution vector
-        auto get_solution(std::vector<real_type> & solution) const -> void override;
-
       private:
         // initialize the cuSOLVER utilities
         auto _initialize_cusolver() -> void override;
@@ -82,41 +76,15 @@ namespace mito::solvers::cuda {
         // allocate the device memory for the matrix and right-hand side
         auto _allocate_device_memory(size_t size) -> void override;
 
-        // initialize the host data for the matrix, right-hand side, and solution
-        auto _initialize_host_data(size_t size) -> void override;
-
-        // deallocate the host memory for the matrix, right-hand side, and solution
-        auto _free_host_memory() -> void override;
-
-        // deallocate the device memory for the matrix and right-hand side
-        auto _free_device_memory() -> void override;
-
       private:
         // host copy of the matrix
-        real_type * _h_matrix;
-        // host copy of the right-hand side
-        real_type * _h_rhs;
-        // host copy of the solution
-        real_type * _h_solution;
+        HostArray<real_type> _h_matrix;
         // device copy of the matrix
-        real_type * _d_matrix;
-        // device copy of the right-hand side
-        real_type * _d_rhs;
-        // flag to indicate which type of host memory has been allocated
-        // 0: no memory allocated
-        // 1: pinned memory allocated
-        // 2: pageable (regular) memory allocated
-        int _allocated_host_memory_type;
+        DeviceArray<real_type> _d_matrix;
         // cuSOLVER handle
         cusolverDnHandle_t _cusolver_handle;
     };
 }
 
 
-// get the template definitions
-#define mito_solvers_backend_cuda_CUDADenseSolver_icc
-#include "CUDADenseSolver.icc"
-#undef mito_solvers_backend_cuda_CUDADenseSolver_icc
-
-
 // end of file
diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.cu b/lib/mito/solvers/backend/cuda/CUDASolver.cu
index 1ecf5cf56..8ee9e4641 100644
--- a/lib/mito/solvers/backend/cuda/CUDASolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.cu
@@ -10,6 +10,9 @@
 // constructor
 template <mito::solvers::cuda::real_c realT>
 mito::solvers::cuda::CUDASolver<realT>::CUDASolver(SolverType solver_type) :
+    _h_rhs(),
+    _h_solution(),
+    _d_rhs(),
     _solver_type(solver_type),
     _size(0),
     _is_solver_initialized(false),
@@ -29,8 +32,8 @@ mito::solvers::cuda::CUDASolver<realT>::initialize(size_t size) -> void
     // check if the solver is already initialized
     if (_is_solver_initialized) {
         throw std::logic_error(
-            "Solver is already initialized. Are you sure you want to reinitialize? Then call "
-            "finalize() first.");
+            "Solver is already initialized. You are not allowed to re-initialize it. If you want "
+            "to re-initialize, you need to create a new solver instance.");
     }
 
     // check if the size is valid
@@ -44,9 +47,6 @@ mito::solvers::cuda::CUDASolver<realT>::initialize(size_t size) -> void
     // allocate host memory
     _allocate_host_memory(size);
 
-    // initialize host data
-    _initialize_host_data(size);
-
     // allocate device memory
     _allocate_device_memory(size);
 
@@ -57,40 +57,28 @@ mito::solvers::cuda::CUDASolver<realT>::initialize(size_t size) -> void
     return;
 }
 
+// add/insert {value} to rhs entry at {row} of the host copy
 template <mito::solvers::cuda::real_c realT>
 auto
-mito::solvers::cuda::CUDASolver<realT>::finalize() -> void
-{
-    // check if the solver is initialized
-    if (_is_solver_initialized) {
-        // free host memory
-        _free_host_memory();
-
-        // free device memory
-        _free_device_memory();
-    }
-
-    // reset the solver initialized flag
-    _is_solver_initialized = false;
-
-    // all done
-    return;
-}
-
-template <mito::solvers::cuda::real_c realT>
-auto
-mito::solvers::cuda::CUDASolver<realT>::reset_system() -> void
+mito::solvers::cuda::CUDASolver<realT>::set_rhs_value(
+    size_t row, const real_type value, const InsertMode insert_mode) -> void
 {
-    // check if the solver is initialized
-    if (!_is_solver_initialized) {
-        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
+    // check if the system assembly is finalized and throw an error if it is
+    if (_is_assembly_finalized) {
+        throw std::logic_error(
+            "System assembly is already finalized. Cannot add/insert values to the rhs.");
     }
 
-    // fill the host matrix, rhs and solution with zeros
-    _initialize_host_data(_size);
+    // check if the row index is within bounds
+    this->_check_index_validity(row);
 
-    // reset the assembly finalized flag
-    _is_assembly_finalized = false;
+    // add/insert the value to the rhs entry in the host rhs
+    if (insert_mode == InsertMode::ADD_VALUE)
+        _h_rhs[row] += value;
+    else if (insert_mode == InsertMode::INSERT_VALUE)
+        _h_rhs[row] = value;
+    else
+        throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
 
     // all done
     return;
diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.h b/lib/mito/solvers/backend/cuda/CUDASolver.h
index 3617c303f..3c8d7d069 100644
--- a/lib/mito/solvers/backend/cuda/CUDASolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.h
@@ -26,11 +26,8 @@ namespace mito::solvers::cuda {
         // initialize the CUDA dense solver
         auto initialize(size_t size) -> void;
 
-        // finalize the CUDA dense solver
-        auto finalize() -> void;
-
         // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)
-        auto reset_system() -> void;
+        virtual auto reset_system() -> void = 0;
 
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
         virtual auto set_matrix_value(
@@ -41,7 +38,7 @@ namespace mito::solvers::cuda {
         // host copy
         virtual auto set_rhs_value(
             size_t row, const real_type value,
-            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void = 0;
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void;
 
         // finalize the linear system assembly
         auto finalize_assembly() -> void;
@@ -50,7 +47,8 @@ namespace mito::solvers::cuda {
         virtual auto solve() -> void = 0;
 
         // get the solution vector
-        virtual auto get_solution(std::vector<real_type> & solution) const -> void = 0;
+        template <class solutionT>
+        auto get_solution(solutionT & solution) const -> void;
 
       protected:
         // initialize the cuSOLVER utilities
@@ -65,19 +63,16 @@ namespace mito::solvers::cuda {
         // allocate the device memory for the matrix and right-hand side
         virtual auto _allocate_device_memory(size_t size) -> void = 0;
 
-        // initialize the host data for the matrix, right-hand side, and solution
-        virtual auto _initialize_host_data(size_t size) -> void = 0;
-
-        // deallocate the host memory for the matrix, right-hand side, and solution
-        virtual auto _free_host_memory() -> void = 0;
-
-        // deallocate the device memory for the matrix and right-hand side
-        virtual auto _free_device_memory() -> void = 0;
-
         // check the validity of the index in the matrix and right-hand side
         auto _check_index_validity(size_t index) const -> void;
 
       protected:
+        // host copy of the right-hand side
+        HostArray<real_type> _h_rhs;
+        // host copy of the solution
+        HostArray<real_type> _h_solution;
+        // device copy of the right-hand side
+        DeviceArray<real_type> _d_rhs;
         // solver type
         SolverType _solver_type;
         // size of the linear system
@@ -91,4 +86,11 @@ namespace mito::solvers::cuda {
     };
 }
 
+
+// get the template definitions
+#define mito_solvers_backend_cuda_CUDASolver_icc
+#include "CUDASolver.icc"
+#undef mito_solvers_backend_cuda_CUDASolver_icc
+
+
 // end of file
diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc b/lib/mito/solvers/backend/cuda/CUDASolver.icc
similarity index 53%
rename from lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
rename to lib/mito/solvers/backend/cuda/CUDASolver.icc
index 21768cd51..5ee5e26f3 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.icc
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.icc
@@ -4,29 +4,29 @@
 //
 
 
-#if !defined(mito_solvers_backend_cuda_CUDADenseSolver_icc)
-#error This header file contains implementation details of class mito::solvers::cuda::CUDADenseSolver
+#if !defined(mito_solvers_backend_cuda_CUDASolver_icc)
+#error This header file contains implementation details of class mito::solvers::cuda::CUDASolver
 #else
 
 template <mito::solvers::cuda::real_c realT>
+template <class solutionT>
 auto
-mito::solvers::cuda::CUDADenseSolver<realT>::get_solution(std::vector<real_type> & solution) const
-    -> void
+mito::solvers::cuda::CUDASolver<realT>::get_solution(solutionT & solution) const -> void
 {
     // check if {_size} matches the size of {solution}
-    assert(this->_size == std::size(solution));
+    assert(_size == std::size(solution));
 
     // reset the entries in {solution}
     std::fill(std::begin(solution), std::end(solution), 0.0);
 
     // copy {_h_solution} into {solution}
-    std::copy(_h_solution, _h_solution + this->_size, std::begin(solution));
+    std::copy(_h_solution.data(), _h_solution.data() + _size, std::begin(solution));
 
     // all done
     return;
 }
 
-#endif    // mito_solvers_backend_cuda_CUDADenseSolver_icc
+#endif    // mito_solvers_backend_cuda_CUDASolver_icc
 
 
 // end of file
diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h
index 9a48b8b05..b31e977a3 100644
--- a/lib/mito/solvers/backend/cuda/forward.h
+++ b/lib/mito/solvers/backend/cuda/forward.h
@@ -16,7 +16,7 @@ namespace mito::solvers::cuda {
     enum SolverType { CHOLESKY, LU };
 
     // class for CUDA dense solver
-    template<real_c realT>
+    template <real_c realT>
     class CUDADenseSolver;
 }
 
diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
index 3ee95da3e..aac76a423 100644
--- a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -54,9 +54,6 @@ TEST(Solvers, CUDADenseSolverSymmetricLinearSystem)
     EXPECT_NEAR(x[8], 9.0, tol);
     EXPECT_NEAR(x[9], 5.0, tol);
 
-    // finalize the solver
-    solver.finalize();
-
     // all done
     return;
 }
@@ -107,9 +104,6 @@ TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem)
     EXPECT_NEAR(x[8], 1.8126616578396353, tol);
     EXPECT_NEAR(x[9], 1.4063308289198175, tol);
 
-    // finalize the solver
-    solver.finalize();
-
     // all done
     return;
 }

From f7d913be6dbc199c131804b9660de7d9edca70b7 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Wed, 17 Sep 2025 17:22:57 +0200
Subject: [PATCH 28/35] solvers/cuda: removed/corrected some comments

---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.cu | 1 -
 lib/mito/solvers/backend/cuda/CUDASolver.h       | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index 969a94538..b5c9162a2 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -89,7 +89,6 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     }
 
     // copy the host matrix and rhs data to device global memory
-    // IMPROVE: We should move the data through streams for better performance later!
     CHECK_CUDA_ERROR(cudaMemcpy(
         _d_matrix.data(), _h_matrix.data(), this->_size * this->_size * sizeof(real_type),
         cudaMemcpyHostToDevice));
diff --git a/lib/mito/solvers/backend/cuda/CUDASolver.h b/lib/mito/solvers/backend/cuda/CUDASolver.h
index 3c8d7d069..f8c5e3e9b 100644
--- a/lib/mito/solvers/backend/cuda/CUDASolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDASolver.h
@@ -23,7 +23,7 @@ namespace mito::solvers::cuda {
         ~CUDASolver();
 
       public:
-        // initialize the CUDA dense solver
+        // initialize the CUDA solver
         auto initialize(size_t size) -> void;
 
         // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)

From 3ee0e234ebf73a2124462698d0c69a75a1fc6353 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Wed, 17 Sep 2025 17:24:01 +0200
Subject: [PATCH 29/35] solvers/cuda/utilities: added DeviceSparseMatrix class
 to manage device memory for sparse matrices

---
 lib/mito/solvers/backend/cuda/utilities.h | 210 ++++++++++++++++++++++
 1 file changed, 210 insertions(+)

diff --git a/lib/mito/solvers/backend/cuda/utilities.h b/lib/mito/solvers/backend/cuda/utilities.h
index 41be8c05f..c84a8af23 100644
--- a/lib/mito/solvers/backend/cuda/utilities.h
+++ b/lib/mito/solvers/backend/cuda/utilities.h
@@ -292,6 +292,216 @@ namespace mito::solvers::cuda {
         // size of the array
         std::size_t _size;
     };
+
+    // a utility class to manage device memory for sparse matrices in CSR format
+    template <std::floating_point T>
+    class DeviceSparseMatrix {
+      public:
+        using real_type = T;
+        using index_type = int;
+
+      public:
+        // default constructor
+        DeviceSparseMatrix() :
+            _values(nullptr),
+            _row_offsets(nullptr),
+            _col_indices(nullptr),
+            _rows(0),
+            _cols(0),
+            _nnz(0)
+        {}
+
+        // allocate on construction
+        explicit DeviceSparseMatrix(std::size_t rows, std::size_t cols, std::size_t nnz) :
+            DeviceSparseMatrix()
+        {
+            allocate(rows, cols, nnz);
+        }
+
+        // destructor
+        ~DeviceSparseMatrix() { _free_memory(); }
+
+        // delete copy constructor
+        DeviceSparseMatrix(const DeviceSparseMatrix &) = delete;
+
+        // delete copy assignment operator
+        DeviceSparseMatrix & operator=(const DeviceSparseMatrix &) = delete;
+
+        // enable move semantics
+        DeviceSparseMatrix(DeviceSparseMatrix && other) noexcept :
+            _values(other._values),
+            _row_offsets(other._row_offsets),
+            _col_indices(other._col_indices),
+            _rows(other._rows),
+            _cols(other._cols),
+            _nnz(other._nnz)
+        {
+            // reset the state of the moved-from object
+            other._values = nullptr;
+            other._row_offsets = nullptr;
+            other._col_indices = nullptr;
+            other._rows = 0;
+            other._cols = 0;
+            other._nnz = 0;
+        }
+
+        DeviceSparseMatrix & operator=(DeviceSparseMatrix && other) noexcept
+        {
+            if (this != &other) {
+                // free existing memory and acquire new memory
+                _free_memory();
+                _values = other._values;
+                _row_offsets = other._row_offsets;
+                _col_indices = other._col_indices;
+                _rows = other._rows;
+                _cols = other._cols;
+                _nnz = other._nnz;
+                // reset the state of the moved-from object
+                other._values = nullptr;
+                other._row_offsets = nullptr;
+                other._col_indices = nullptr;
+                other._rows = 0;
+                other._cols = 0;
+                other._nnz = 0;
+            }
+            return *this;
+        }
+
+        // allocate or reallocate memory for the matrix
+        void allocate(std::size_t rows, std::size_t cols, std::size_t nnz)
+        {
+            if (_rows == rows && _cols == cols && _nnz == nnz)
+                return;
+
+            // free old memory if any
+            _free_memory();
+
+            _rows = rows;
+            _cols = cols;
+            _nnz = nnz;
+            if (_rows == 0 || _cols == 0 || _nnz == 0)    // no memory to allocate
+                return;
+
+            // allocate device memory for the CSR representation
+            cudaError_t error;
+
+            error = cudaMalloc((void **) &_values, _nnz * sizeof(real_type));
+            if (error != cudaSuccess) {
+                throw std::runtime_error(
+                    "[DeviceSparseMatrix] Error: Failed to allocate values array: "
+                    + std::string(cudaGetErrorString(error)));
+            }
+
+            error = cudaMalloc((void **) &_row_offsets, (_rows + 1) * sizeof(index_type));
+            if (error != cudaSuccess) {
+                throw std::runtime_error(
+                    "[DeviceSparseMatrix] Error: Failed to allocate row offsets array: "
+                    + std::string(cudaGetErrorString(error)));
+            }
+
+            error = cudaMalloc((void **) &_col_indices, _nnz * sizeof(index_type));
+            if (error != cudaSuccess) {
+                throw std::runtime_error(
+                    "[DeviceSparseMatrix] Error: Failed to allocate column indices array: "
+                    + std::string(cudaGetErrorString(error)));
+            }
+
+            // all done
+            return;
+        }
+
+        // zero out the device matrix
+        void zero()
+        {
+            if (_values) {
+                cudaError_t error = cudaMemset(_values, 0, _nnz * sizeof(real_type));
+                if (error != cudaSuccess) {
+                    throw std::runtime_error(
+                        "[DeviceSparseMatrix] Error: Failed to zero out device memory: "
+                        + std::string(cudaGetErrorString(error)));
+                }
+            }
+        }
+
+        // values accessor
+        real_type * values() { return _values; }
+
+        // row offsets accessor
+        index_type * row_offsets() { return _row_offsets; }
+
+        // column indices accessor
+        index_type * col_indices() { return _col_indices; }
+
+        // const values accessor
+        const real_type * values() const { return _values; }
+
+        // const row offsets accessor
+        const index_type * row_offsets() const { return _row_offsets; }
+
+        // const column indices accessor
+        const index_type * col_indices() const { return _col_indices; }
+
+        // row number accessor
+        size_t rows() const { return _rows; }
+
+        // column number accessor
+        size_t cols() const { return _cols; }
+
+        // non-zero entries accessor
+        size_t nnz() const { return _nnz; }
+
+      private:
+        // free the memory in the matrix
+        void _free_memory()
+        {
+            if (_values) {
+                cudaError_t error = cudaFree(_values);
+                if (error != cudaSuccess) {
+                    std::cerr << "[DeviceSparseMatrix] Error: Failed to free values array: "
+                              << cudaGetErrorString(error) << "\n";
+                }
+            }
+            if (_row_offsets) {
+                cudaError_t error = cudaFree(_row_offsets);
+                if (error != cudaSuccess) {
+                    std::cerr << "[DeviceSparseMatrix] Error: Failed to free row offsets array: "
+                              << cudaGetErrorString(error) << "\n";
+                }
+            }
+            if (_col_indices) {
+                cudaError_t error = cudaFree(_col_indices);
+                if (error != cudaSuccess) {
+                    std::cerr << "[DeviceSparseMatrix] Error: Failed to free column indices array: "
+                              << cudaGetErrorString(error) << "\n";
+                }
+            }
+
+            // reset the state
+            _values = nullptr;
+            _row_offsets = nullptr;
+            _col_indices = nullptr;
+            _rows = 0;
+            _cols = 0;
+            _nnz = 0;
+
+            // all done
+            return;
+        }
+
+      private:
+        // pointer to the non-zero values
+        real_type * _values;
+        // pointer to the row offsets
+        index_type * _row_offsets;
+        // pointer to the column indices
+        index_type * _col_indices;
+        // number of rows
+        size_t _rows;
+        // number of columns
+        size_t _cols;
+        // number of non-zero entries
+        size_t _nnz;
+    };
 }
 
 

From ddefaab93db87259df298821a210f1310be08a02 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Thu, 18 Sep 2025 11:27:40 +0200
Subject: [PATCH 30/35] solvers/cuda/CUDASparseSolver: first attempt at the
 CUDA sparse solver implementation

This is the first attempt at implementing a sparse linear solver in CUDA. We used the Eigen sparse matrix to store the system matrix on the host side. The memory arrays we developed for the dense solver were reused to store the right-hand side and the solution vectors on the both host and device side. We used the cuDSS library to solve the linear system.
---
 .cmake/mito_cuda.cmake                        |  21 ++
 .cmake/mito_sources.cmake                     |   1 +
 .../solvers/backend/cuda/CUDAErrorChecks.h    |  10 +
 .../solvers/backend/cuda/CUDASparseSolver.cu  | 294 ++++++++++++++++++
 .../solvers/backend/cuda/CUDASparseSolver.h   |  65 ++++
 lib/mito/solvers/backend/cuda/api.h           |   5 +-
 lib/mito/solvers/backend/cuda/externals.h     |   6 +
 lib/mito/solvers/backend/cuda/factories.h     |   8 +-
 lib/mito/solvers/backend/cuda/forward.h       |   4 +
 lib/mito/solvers/backend/cuda/public.h        |   3 +
 10 files changed, 415 insertions(+), 2 deletions(-)
 create mode 100644 lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
 create mode 100644 lib/mito/solvers/backend/cuda/CUDASparseSolver.h

diff --git a/.cmake/mito_cuda.cmake b/.cmake/mito_cuda.cmake
index 778e80a53..d1f0cc7b1 100644
--- a/.cmake/mito_cuda.cmake
+++ b/.cmake/mito_cuda.cmake
@@ -27,6 +27,27 @@ if(WITH_CUDA)
         set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE FILEPATH "Host compiler for NVCC")
     endif()
     message(STATUS "Using CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}")
+    # disable offline compilation support warnings
+    target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:-Wno-deprecated-gpu-targets>)
+    # find Eigen for CUDA sparse solver
+    find_package(Eigen3 3.3 QUIET NO_MODULE)
+    if(Eigen3_FOUND)
+        # add Eigen compiler definition
+        message(STATUS "Found Eigen3: ${EIGEN3_INCLUDE_DIR}")
+        add_definitions(-DWITH_EIGEN)
+        # add include directories for Eigen
+        target_include_directories(mito PUBLIC ${EIGEN3_INCLUDE_DIR})
+        # disable warnings due to incompatible return types in Eigen
+        target_compile_options(mito PUBLIC $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
+        # find cudss library
+        find_package(cudss REQUIRED)
+        # include cudss headers
+        target_include_directories(mito PUBLIC ${cudss_INCLUDE_DIR})
+        # link against cudss libraries
+        target_link_libraries(mito PUBLIC cudss)
+    else()
+        message(WARNING "Eigen3 not found. CUDA sparse solver parts will be disabled.")
+    endif()
 endif()
 
 
diff --git a/.cmake/mito_sources.cmake b/.cmake/mito_sources.cmake
index 7eede5a15..86c600cd2 100644
--- a/.cmake/mito_sources.cmake
+++ b/.cmake/mito_sources.cmake
@@ -19,6 +19,7 @@ if (WITH_CUDA)
 set(MITO_SOURCES ${MITO_SOURCES}
     lib/mito/solvers/backend/cuda/CUDASolver.cu
     lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+    lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
 )
 endif()
 
diff --git a/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h
index d98e0ba0f..0fb22c848 100644
--- a/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h
+++ b/lib/mito/solvers/backend/cuda/CUDAErrorChecks.h
@@ -58,3 +58,13 @@ cusolverGetErrorString(cusolverStatus_t status)
             exit(EXIT_FAILURE);                                                                    \
         }                                                                                          \
     } while (0)
+
+// cuDSS error checking macro
+#define CHECK_CUDSS_ERROR(call)                                                        \
+    do {                                                                               \
+        cudssStatus_t status = (call);                                                 \
+        if (status != CUDSS_STATUS_SUCCESS) {                                          \
+            fprintf(stderr, "cuDSS error: %d at %s:%d\n", status, __FILE__, __LINE__); \
+            exit(EXIT_FAILURE);                                                        \
+        }                                                                              \
+    } while (0)
diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
new file mode 100644
index 000000000..cf127183e
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
@@ -0,0 +1,294 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+#include "public.h"
+
+
+// constructor
+template <mito::solvers::cuda::real_c realT>
+mito::solvers::cuda::CUDASparseSolver<realT>::CUDASparseSolver(SolverType solver_type) :
+    CUDASolver<realT>(solver_type),
+    _h_matrix(),
+    _d_matrix(),
+    _d_solution(),
+    _cudss_handle()
+{
+    // initialize cuSOLVER
+    _initialize_cusolver();
+}
+
+// destructor
+template <mito::solvers::cuda::real_c realT>
+mito::solvers::cuda::CUDASparseSolver<realT>::~CUDASparseSolver()
+{
+    // finalize cuSOLVER
+    _finalize_cusolver();
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::initialize(size_t size, size_t nnz) -> void
+{
+    // call the base class initialize function
+    CUDASolver<realT>::initialize(size);
+
+    // reserve space for non-zero entries per row in the host sparse matrix for efficiency
+    // NOTE: This is just an estimate. The actual number of non-zero entries may be less than
+    // nnz. But it is better to set it to a reasonable value to avoid multiple reallocations.
+    // In general, over-allocating is better than under-allocating.
+    nnz <= 0 ? _h_matrix.reserve(Eigen::VectorXi::Constant(size, size)) :
+               _h_matrix.reserve(Eigen::VectorXi::Constant(size, nnz));
+
+    // all done
+    return;
+}
+
+// add/insert {value} to matrix entry at ({row}, {col}) of the host copy
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::set_matrix_value(
+    size_t row, size_t col, const real_type value, const InsertMode insert_mode) -> void
+{
+    // check if the system assembly is finalized and throw an error if it is
+    if (this->_is_assembly_finalized) {
+        throw std::logic_error(
+            "System assembly is already finalized. Cannot add/insert values to the matrix.");
+    }
+
+    // check if the row and column indices are within bounds
+    this->_check_index_validity(row);
+    this->_check_index_validity(col);
+
+    // add/insert the value to the matrix entry in the host matrix
+    // NOTE: We store the host matrix in CSR format using Eigen's SparseMatrix class in row-major
+    // order.
+    if (insert_mode == InsertMode::ADD_VALUE)
+        _h_matrix.coeffRef(row, col) += value;
+    else if (insert_mode == InsertMode::INSERT_VALUE)
+        _h_matrix.coeffRef(row, col) = value;
+    else
+        throw std::invalid_argument("Invalid insert mode. Use ADD_VALUE or INSERT_VALUE.");
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::reset_system() -> void
+{
+    // check if the solver is initialized
+    if (!this->_is_solver_initialized) {
+        throw std::logic_error("Solver is not yet initialized. Call initialize() first.");
+    }
+
+    // fill the host matrix, rhs and solution with zeros
+    _h_matrix.setZero();
+    this->_h_rhs.zero();
+    this->_h_solution.zero();
+
+    // reset the assembly finalized flag
+    this->_is_assembly_finalized = false;
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::solve() -> void
+{
+    // check if the assembly is finalized
+    if (!this->_is_assembly_finalized) {
+        throw std::logic_error(
+            "System assembly is not yet finalized. Call finalize_assembly() first.");
+    }
+
+    // finalize the host matrix (compress the storage)
+    // NOTE: This is not a mandatory step as we are only using the Eigen matrix for storage on the
+    // host but we are doing it anyway.
+    _h_matrix.makeCompressed();
+
+    // create and fill the auxiliary containers to hold the CSR representation of the host matrix
+    size_t nnz_matrix = _h_matrix.nonZeros();
+    std::vector<real_type> h_csr_values(nnz_matrix, 0.0);
+    std::vector<int> h_csr_row_offsets(this->_size + 1, 0);
+    std::vector<int> h_csr_col_indices(nnz_matrix, 0);
+    int idx = 0;
+    for (size_t k = 0; k < this->_size; ++k) {    // for each row
+        h_csr_row_offsets[k] = idx;
+        for (typename Eigen::SparseMatrix<real_type, Eigen::RowMajor>::InnerIterator it(
+                 _h_matrix, k);
+             it; ++it) {    // for each non-zero entry in the row
+            h_csr_values[idx] = it.value();
+            h_csr_col_indices[idx] = it.col();
+            ++idx;
+        }
+    }
+    h_csr_row_offsets[this->_size] = nnz_matrix;
+
+    // allocate device memory for the matrix
+    _d_matrix.allocate(this->_size, this->_size, nnz_matrix);
+
+    // copy the host matrix and rhs data to device global memory
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        _d_matrix.values(), h_csr_values.data(), h_csr_values.size() * sizeof(real_type),
+        cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        _d_matrix.row_offsets(), h_csr_row_offsets.data(), h_csr_row_offsets.size() * sizeof(int),
+        cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        _d_matrix.col_indices(), h_csr_col_indices.data(), h_csr_col_indices.size() * sizeof(int),
+        cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type),
+        cudaMemcpyHostToDevice));
+
+    // cuDSS data structures to solve the linear system
+    cudssConfig_t config;
+    cudssData_t data;
+    cudssMatrix_t d_A, d_b, d_x;
+
+    // create the cuDSS config and data structures
+    CHECK_CUDSS_ERROR(cudssConfigCreate(&config));
+    CHECK_CUDSS_ERROR(cudssDataCreate(_cudss_handle, &data));
+
+    // determine the data type
+    cudaDataType_t dtype;
+    if constexpr (std::is_same_v<real_type, double>) {
+        dtype = CUDA_R_64F;
+    } else if constexpr (std::is_same_v<real_type, float>) {
+        dtype = CUDA_R_32F;
+    }
+
+    // determine the matrix type based on the solver type
+    cudssMatrixType_t mtype;
+    if (this->_solver_type == SolverType::LU) {
+        mtype = CUDSS_MTYPE_GENERAL;
+    } else if (this->_solver_type == SolverType::CHOLESKY) {
+        mtype = CUDSS_MTYPE_SPD;
+    } else {
+        throw std::logic_error(
+            "Unsupported solver type. Only LU and CHOLESKY are supported in the CUDA sparse "
+            "solver.");
+    }
+
+    // create (CSR) matrix and vector descriptors
+    CHECK_CUDSS_ERROR(cudssMatrixCreateCsr(
+        &d_A, this->_size, this->_size, nnz_matrix, _d_matrix.row_offsets(), NULL,
+        _d_matrix.col_indices(), _d_matrix.values(), CUDA_R_32I, dtype, mtype, CUDSS_MVIEW_FULL,
+        CUDSS_BASE_ZERO));
+    CHECK_CUDSS_ERROR(cudssMatrixCreateDn(
+        &d_b, this->_size, 1, this->_size, this->_d_rhs.data(), dtype, CUDSS_LAYOUT_COL_MAJOR));
+    CHECK_CUDSS_ERROR(cudssMatrixCreateDn(
+        &d_x, this->_size, 1, this->_size, _d_solution.data(), dtype, CUDSS_LAYOUT_COL_MAJOR));
+
+    /************* solve the linear system in three stages *************/
+
+    // Stage 1: reordering & symbolic factorization
+    CHECK_CUDSS_ERROR(
+        cudssExecute(_cudss_handle, CUDSS_PHASE_ANALYSIS, config, data, d_A, d_x, d_b));
+
+    // Stage 2: numerical factorization
+    CHECK_CUDSS_ERROR(
+        cudssExecute(_cudss_handle, CUDSS_PHASE_FACTORIZATION, config, data, d_A, d_x, d_b));
+
+    // Stage 3: solve the system
+    CHECK_CUDSS_ERROR(cudssExecute(_cudss_handle, CUDSS_PHASE_SOLVE, config, data, d_A, d_x, d_b));
+
+    // copy the solution back to host
+    CHECK_CUDA_ERROR(cudaMemcpy(
+        this->_h_solution.data(), _d_solution.data(), this->_size * sizeof(real_type),
+        cudaMemcpyDeviceToHost));
+
+    // destroy the cuDSS data structures
+    CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_A));
+    CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_b));
+    CHECK_CUDSS_ERROR(cudssMatrixDestroy(d_x));
+    CHECK_CUDSS_ERROR(cudssDataDestroy(_cudss_handle, data));
+    CHECK_CUDSS_ERROR(cudssConfigDestroy(config));
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::_initialize_cusolver() -> void
+{
+    // create the cuDSS handle
+    CHECK_CUDSS_ERROR(cudssCreate(&_cudss_handle));
+
+    // create a cuda stream
+    CHECK_CUDA_ERROR(cudaStreamCreateWithPriority(&(this->_cuda_stream), cudaStreamNonBlocking, 0));
+
+    // set the stream for the cuDSS handle
+    CHECK_CUDSS_ERROR(cudssSetStream(_cudss_handle, this->_cuda_stream));
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::_finalize_cusolver() -> void
+{
+    // destroy the cuDSS handle
+    CHECK_CUDSS_ERROR(cudssDestroy(_cudss_handle));
+
+    // destroy the cuda stream
+    CHECK_CUDA_ERROR(cudaStreamDestroy(this->_cuda_stream));
+
+    // reset the handle and stream pointers
+    _cudss_handle = nullptr;
+    this->_cuda_stream = nullptr;
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::_allocate_host_memory(size_t size) -> void
+{
+    // allocate host memory for matrix, rhs and solution
+    _h_matrix.resize(size, size);
+    this->_h_rhs.allocate(size);
+    this->_h_solution.allocate(size);
+
+    // zero out the arrays
+    _h_matrix.setZero();
+    this->_h_rhs.zero();
+    this->_h_solution.zero();
+
+    // all done
+    return;
+}
+
+template <mito::solvers::cuda::real_c realT>
+auto
+mito::solvers::cuda::CUDASparseSolver<realT>::_allocate_device_memory(size_t size) -> void
+{
+    // NOTE: We are not allocating memory for the matrix here because we do not know the number of
+    // non-zero entries yet and allocating device memory is an expensive operation. So we will
+    // allocate the device memory for the matrix directly after the assembly is finalized.
+
+    // allocate device memory for rhs and solution
+    this->_d_rhs.allocate(size);
+    _d_solution.allocate(size);
+
+    // zero out the solution array
+    _d_solution.zero();
+
+    // all done
+    return;
+}
+
+// explicit instantiation of the {CUDASparseSolver} class for doubles
+template class mito::solvers::cuda::CUDASparseSolver<double>;
+
+// explicit instantiation of the {CUDASparseSolver} class for floats
+template class mito::solvers::cuda::CUDASparseSolver<float>;
diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.h b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h
new file mode 100644
index 000000000..8b97fdba0
--- /dev/null
+++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h
@@ -0,0 +1,65 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+// code guard
+#pragma once
+
+
+namespace mito::solvers::cuda {
+    template <real_c realT>
+    class CUDASparseSolver : public CUDASolver<realT> {
+      private:
+        // real type definition from CUDASolver
+        using real_type = typename CUDASolver<realT>::real_type;
+
+      public:
+        // constructor
+        CUDASparseSolver(SolverType solver_type = SolverType::LU);
+
+        // destructor
+        ~CUDASparseSolver();
+
+      public:
+        // initialize the CUDA sparse solver
+        auto initialize(size_t size, size_t nnz = -1) -> void;
+
+        // set (add or insert depending on the mode) the value of a matrix entry in the host copy
+        auto set_matrix_value(
+            size_t row, size_t col, const real_type value,
+            const InsertMode insert_mode = InsertMode::INSERT_VALUE) -> void override;
+
+        // reset the linear system (i.e. the host copy of the matrix, right-hand side and solution)
+        auto reset_system() -> void override;
+
+        // solve the linear system
+        auto solve() -> void override;
+
+      private:
+        // initialize the cuSOLVER utilities
+        auto _initialize_cusolver() -> void override;
+
+        // destroy the cuSOLVER utilities
+        auto _finalize_cusolver() -> void override;
+
+        // allocate the host memory for the matrix, right-hand side, and solution
+        auto _allocate_host_memory(size_t size) -> void override;
+
+        // allocate the device memory for the matrix and right-hand side
+        auto _allocate_device_memory(size_t size) -> void override;
+
+      private:
+        // host copy of the matrix in CSR format
+        Eigen::SparseMatrix<real_type, Eigen::RowMajor> _h_matrix;
+        // device copy of the matrix in CSR format
+        DeviceSparseMatrix<real_type> _d_matrix;
+        // device copy of the solution
+        DeviceArray<real_type> _d_solution;
+        // cuDSS handle
+        cudssHandle_t _cudss_handle;
+    };
+}
+
+
+// end of file
diff --git a/lib/mito/solvers/backend/cuda/api.h b/lib/mito/solvers/backend/cuda/api.h
index 5bfab20de..fbb598816 100644
--- a/lib/mito/solvers/backend/cuda/api.h
+++ b/lib/mito/solvers/backend/cuda/api.h
@@ -10,9 +10,12 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    template<real_c realT>
+    template <real_c realT>
     using dense_t = CUDADenseSolver<realT>;
 
+    // cuda sparse solver
+    template <real_c realT>
+    using sparse_t = CUDASparseSolver<realT>;
 }
 
 
diff --git a/lib/mito/solvers/backend/cuda/externals.h b/lib/mito/solvers/backend/cuda/externals.h
index 139bb748c..a6422a3bd 100644
--- a/lib/mito/solvers/backend/cuda/externals.h
+++ b/lib/mito/solvers/backend/cuda/externals.h
@@ -13,11 +13,17 @@
 #include <cassert>
 #include <iostream>
 #include <type_traits>
+#ifdef WITH_EIGEN
+#include <Eigen/Sparse>    // for sparse matrix support
+#endif                     // WITH_EIGEN
 
 
 // cuda support
 #include <cuda_runtime.h>
 #include <cusolverDn.h>
+#ifdef WITH_EIGEN
+#include <cudss.h>
+#endif    // WITH_EIGEN
 
 
 // utilities
diff --git a/lib/mito/solvers/backend/cuda/factories.h b/lib/mito/solvers/backend/cuda/factories.h
index f628c28e0..8dd3f44a6 100644
--- a/lib/mito/solvers/backend/cuda/factories.h
+++ b/lib/mito/solvers/backend/cuda/factories.h
@@ -10,12 +10,18 @@
 namespace mito::solvers::cuda {
 
     // cuda dense solver
-    template<real_c realT>
+    template <real_c realT>
     auto dense(SolverType solver_type = SolverType::LU)
     {
         return dense_t<realT>(solver_type);
     }
 
+    // cuda sparse solver
+    template <real_c realT>
+    auto sparse(SolverType solver_type = SolverType::LU)
+    {
+        return sparse_t<realT>(solver_type);
+    }
 }
 
 
diff --git a/lib/mito/solvers/backend/cuda/forward.h b/lib/mito/solvers/backend/cuda/forward.h
index b31e977a3..ec32486d8 100644
--- a/lib/mito/solvers/backend/cuda/forward.h
+++ b/lib/mito/solvers/backend/cuda/forward.h
@@ -18,6 +18,10 @@ namespace mito::solvers::cuda {
     // class for CUDA dense solver
     template <real_c realT>
     class CUDADenseSolver;
+
+    // class for CUDA sparse solver
+    template <real_c realT>
+    class CUDASparseSolver;
 }
 
 
diff --git a/lib/mito/solvers/backend/cuda/public.h b/lib/mito/solvers/backend/cuda/public.h
index 84caec4cd..cd87df6b1 100644
--- a/lib/mito/solvers/backend/cuda/public.h
+++ b/lib/mito/solvers/backend/cuda/public.h
@@ -19,6 +19,9 @@
 // classes
 #include "CUDASolver.h"
 #include "CUDADenseSolver.h"
+#ifdef WITH_EIGEN
+#include "CUDASparseSolver.h"
+#endif    // WITH_EIGEN
 
 // error checks
 #include "CUDAErrorChecks.h"

From 7ecb1cea3369335781b6a64e1cbb942da72454d3 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Thu, 18 Sep 2025 11:45:19 +0200
Subject: [PATCH 31/35] tests/mito.lib/solvers/cuda_sparse: added symmetric &
 unsymmetric system tests for the CUDA sparse solver

---
 .cmake/mito_tests_mito_lib.cmake              |   1 +
 .../cuda_sparse_solver_solve_linear_system.cc | 131 ++++++++++++++++++
 2 files changed, 132 insertions(+)
 create mode 100644 tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc

diff --git a/.cmake/mito_tests_mito_lib.cmake b/.cmake/mito_tests_mito_lib.cmake
index e1fb3bcee..0e63104eb 100644
--- a/.cmake/mito_tests_mito_lib.cmake
+++ b/.cmake/mito_tests_mito_lib.cmake
@@ -71,6 +71,7 @@ endif()
 
 if(WITH_CUDA)
     mito_test_driver(tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc)
+    mito_test_driver(tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc)
 endif()
 
 # tensor
diff --git a/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc
new file mode 100644
index 000000000..5a64960d2
--- /dev/null
+++ b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc
@@ -0,0 +1,131 @@
+// -*- c++ -*-
+//
+// Copyright (c) 2020-2024, the MiTo Authors, all rights reserved
+//
+
+
+#include <gtest/gtest.h>
+#include <mito.h>
+
+TEST(Solvers, CUDASparseSolverSymmetricLinearSystem)
+{
+    // the size of the linear system
+    int N = 10;
+
+    // numerical tolerance for the solution
+    double tol = 1.0e-10;
+
+    // instantiate a CUDA Sparse solver for a linear system of size {N}
+    auto solver =
+        mito::solvers::cuda::sparse<mito::real>(mito::solvers::cuda::SolverType::CHOLESKY);
+    solver.initialize(N, 3);    // 3 non-zero entries per row
+
+    // set matrix and right-hand side entries
+    for (int i = 0; i < N; i++) {
+        // matrix: diagonal
+        solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        // matrix: subdiagonal
+        if (i > 0) {
+            solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::ADD_VALUE);
+        }
+        // matrix: superdiagonal
+        if (i < N - 1) {
+            solver.set_matrix_value(i, i + 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        // rhs
+        solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::ADD_VALUE);
+    }
+
+    // finalize the assembly of the linear system
+    solver.finalize_assembly();
+
+    // solve the linear system
+    solver.solve();
+
+    // read the solution
+    auto x = std::vector<double>(N);
+    solver.get_solution(x);
+
+    // check the solution
+    EXPECT_NEAR(x[0], 5.0, tol);
+    EXPECT_NEAR(x[1], 9.0, tol);
+    EXPECT_NEAR(x[2], 12.0, tol);
+    EXPECT_NEAR(x[3], 14.0, tol);
+    EXPECT_NEAR(x[4], 15.0, tol);
+    EXPECT_NEAR(x[5], 15.0, tol);
+    EXPECT_NEAR(x[6], 14.0, tol);
+    EXPECT_NEAR(x[7], 12.0, tol);
+    EXPECT_NEAR(x[8], 9.0, tol);
+    EXPECT_NEAR(x[9], 5.0, tol);
+
+    // all done
+    return;
+}
+
+TEST(Solvers, CUDASparseSolverUnsymmetricLinearSystem)
+{
+    // the size of the linear system
+    int N = 10;
+
+    // numerical tolerance for the solution
+    double tol = 1.0e-10;
+
+    // instantiate a CUDA Sparse solver for a linear system of size {N}
+    auto solver = mito::solvers::cuda::sparse<mito::real>(mito::solvers::cuda::SolverType::LU);
+    solver.initialize(N, 3);    // 3 non-zero entries per row
+
+    // set matrix and right-hand side entries
+    for (int i = 0; i < N; i++) {
+        // matrix: diagonal
+        solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::ADD_VALUE);
+        // matrix: subdiagonal
+        if (i > 0) {
+            solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        // matrix: superdiagonal
+        if (i < N - 1) {
+            solver.set_matrix_value(i, i + 1, -0.5, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        }
+        // rhs
+        solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::ADD_VALUE);
+    }
+
+    // finalize the assembly of the linear system
+    solver.finalize_assembly();
+
+    // solve the linear system
+    solver.solve();
+
+    // read the solution
+    auto x = std::vector<double>(N);
+    solver.get_solution(x);
+
+    // check the solution
+    EXPECT_NEAR(x[0], 0.8284194482079074, tol);
+    EXPECT_NEAR(x[1], 1.3136777928316294, tol);
+    EXPECT_NEAR(x[2], 1.5978722749107030, tol);
+    EXPECT_NEAR(x[3], 1.7641335139795535, tol);
+    EXPECT_NEAR(x[4], 1.8607895060968098, tol);
+    EXPECT_NEAR(x[5], 1.9148909964281315, tol);
+    EXPECT_NEAR(x[6], 1.9379849735189063, tol);
+    EXPECT_NEAR(x[7], 1.9221579012193621, tol);
+    EXPECT_NEAR(x[8], 1.8126616578396353, tol);
+    EXPECT_NEAR(x[9], 1.4063308289198175, tol);
+
+    // all done
+    return;
+}
+
+
+int
+main(int argc, char ** argv)
+{
+    ::testing::InitGoogleTest(&argc, argv);
+    auto result = RUN_ALL_TESTS();
+
+    // all done
+    return result;
+}
+
+
+// end of file

From 356a00bba02007d6a017cfa67fad5884ab99d32f Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Thu, 18 Sep 2025 14:35:12 +0200
Subject: [PATCH 32/35] solvers/cuda/CUDASparseSolver: corrected nnz default
 input as size_t cannot be negative

---
 lib/mito/solvers/backend/cuda/CUDASparseSolver.cu | 4 ++--
 lib/mito/solvers/backend/cuda/CUDASparseSolver.h  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
index cf127183e..bdc404c78 100644
--- a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
@@ -39,8 +39,8 @@ mito::solvers::cuda::CUDASparseSolver<realT>::initialize(size_t size, size_t nnz
     // NOTE: This is just an estimate. The actual number of non-zero entries may be less than
     // nnz. But it is better to set it to a reasonable value to avoid multiple reallocations.
     // In general, over-allocating is better than under-allocating.
-    nnz <= 0 ? _h_matrix.reserve(Eigen::VectorXi::Constant(size, size)) :
-               _h_matrix.reserve(Eigen::VectorXi::Constant(size, nnz));
+    (nnz == 0) ? _h_matrix.reserve(Eigen::VectorXi::Constant(size, size)) :
+                 _h_matrix.reserve(Eigen::VectorXi::Constant(size, nnz));
 
     // all done
     return;
diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.h b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h
index 8b97fdba0..c8967aee1 100644
--- a/lib/mito/solvers/backend/cuda/CUDASparseSolver.h
+++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.h
@@ -23,7 +23,7 @@ namespace mito::solvers::cuda {
 
       public:
         // initialize the CUDA sparse solver
-        auto initialize(size_t size, size_t nnz = -1) -> void;
+        auto initialize(size_t size, size_t nnz = 0) -> void;
 
         // set (add or insert depending on the mode) the value of a matrix entry in the host copy
         auto set_matrix_value(

From a7e15e6dde1ab9e4d901f0d18dcd980bf28986f9 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Thu, 18 Sep 2025 14:36:59 +0200
Subject: [PATCH 33/35] solvers/cuda: added synchronization steps after memory
 transfers & solve steps to ensure data integrity & correctness

---
 lib/mito/solvers/backend/cuda/CUDADenseSolver.cu  | 1 +
 lib/mito/solvers/backend/cuda/CUDASparseSolver.cu | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
index b5c9162a2..2c0e79401 100644
--- a/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDADenseSolver.cu
@@ -95,6 +95,7 @@ mito::solvers::cuda::CUDADenseSolver<realT>::solve() -> void
     CHECK_CUDA_ERROR(cudaMemcpy(
         this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type),
         cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // allocate device memory for temporary variables in the factorization
     int * d_pivot = nullptr;
diff --git a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
index bdc404c78..8186672e1 100644
--- a/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
+++ b/lib/mito/solvers/backend/cuda/CUDASparseSolver.cu
@@ -146,6 +146,7 @@ mito::solvers::cuda::CUDASparseSolver<realT>::solve() -> void
     CHECK_CUDA_ERROR(cudaMemcpy(
         this->_d_rhs.data(), this->_h_rhs.data(), this->_size * sizeof(real_type),
         cudaMemcpyHostToDevice));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // cuDSS data structures to solve the linear system
     cudssConfig_t config;
@@ -191,13 +192,16 @@ mito::solvers::cuda::CUDASparseSolver<realT>::solve() -> void
     // Stage 1: reordering & symbolic factorization
     CHECK_CUDSS_ERROR(
         cudssExecute(_cudss_handle, CUDSS_PHASE_ANALYSIS, config, data, d_A, d_x, d_b));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // Stage 2: numerical factorization
     CHECK_CUDSS_ERROR(
         cudssExecute(_cudss_handle, CUDSS_PHASE_FACTORIZATION, config, data, d_A, d_x, d_b));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // Stage 3: solve the system
     CHECK_CUDSS_ERROR(cudssExecute(_cudss_handle, CUDSS_PHASE_SOLVE, config, data, d_A, d_x, d_b));
+    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
 
     // copy the solution back to host
     CHECK_CUDA_ERROR(cudaMemcpy(

From 0bf8c74a25fd763c39dd246e951b41e7de874727 Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Thu, 18 Sep 2025 15:05:31 +0200
Subject: [PATCH 34/35] tests/mito.lib/solvers/cuda_sparse: not setting nnz per
 row in unsymmetric test to check if solver handles it

---
 .../mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc
index 5a64960d2..e29a8a77e 100644
--- a/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_sparse_solver_solve_linear_system.cc
@@ -72,7 +72,7 @@ TEST(Solvers, CUDASparseSolverUnsymmetricLinearSystem)
 
     // instantiate a CUDA Sparse solver for a linear system of size {N}
     auto solver = mito::solvers::cuda::sparse<mito::real>(mito::solvers::cuda::SolverType::LU);
-    solver.initialize(N, 3);    // 3 non-zero entries per row
+    solver.initialize(N);    // non-zeros per row are not set
 
     // set matrix and right-hand side entries
     for (int i = 0; i < N; i++) {

From d0e52bcf24002d37f4356436b0d512515c6fe90b Mon Sep 17 00:00:00 2001
From: Sai Kubair Kota <s.k.kota@tudelft.nl>
Date: Thu, 18 Sep 2025 15:26:13 +0200
Subject: [PATCH 35/35] tests/mito.lib/solvers/cuda_dense: made matrices truly
 dense by filling all entries so we can test dense solver more thoroughly

---
 .../cuda_dense_solver_solve_linear_system.cc  | 94 ++++++++++++-------
 1 file changed, 58 insertions(+), 36 deletions(-)

diff --git a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
index aac76a423..6cd7ea189 100644
--- a/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
+++ b/tests/mito.lib/solvers/cuda_dense_solver_solve_linear_system.cc
@@ -20,16 +20,27 @@ TEST(Solvers, CUDADenseSolverSymmetricLinearSystem)
     auto solver = mito::solvers::cuda::dense<mito::real>(mito::solvers::cuda::SolverType::CHOLESKY);
     solver.initialize(N);
 
-    // set matrix and right-hand side entries
+    // set matrix entries
+    double A[10][10] = { { 3.5, -1.2, 0.8, 0.0, -0.5, 1.1, 0.0, 0.3, -0.9, 0.4 },
+                         { -1.2, 4.0, 1.3, -0.4, 0.6, -0.7, 0.0, 0.5, 0.0, -1.0 },
+                         { 0.8, 1.3, 3.8, 0.7, -0.8, 0.0, -0.2, 0.6, -0.5, 0.0 },
+                         { 0.0, -0.4, 0.7, 4.2, 1.0, 0.9, 0.3, 0.0, 0.4, -0.3 },
+                         { -0.5, 0.6, -0.8, 1.0, 4.5, 0.4, -0.7, 0.8, 0.0, 0.2 },
+                         { 1.1, -0.7, 0.0, 0.9, 0.4, 3.6, -0.3, 0.0, -0.4, 0.5 },
+                         { 0.0, 0.0, -0.2, 0.3, -0.7, -0.3, 3.3, 0.6, -0.8, 0.0 },
+                         { 0.3, 0.5, 0.6, 0.0, 0.8, 0.0, 0.6, 4.1, 0.2, 0.7 },
+                         { -0.9, 0.0, -0.5, 0.4, 0.0, -0.4, -0.8, 0.2, 3.7, 0.9 },
+                         { 0.4, -1.0, 0.0, -0.3, 0.2, 0.5, 0.0, 0.7, 0.9, 4.0 } };
     for (int i = 0; i < N; i++) {
-        solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
-        if (i > 0) {
-            solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        for (int j = 0; j < N; j++) {
+            solver.set_matrix_value(i, j, A[i][j], mito::solvers::cuda::InsertMode::INSERT_VALUE);
         }
-        if (i < N - 1) {
-            solver.set_matrix_value(i, i + 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
-        }
-        solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+    }
+
+    // set rhs entries
+    double b[10] = { 1.0, -2.0, 0.5, 1.5, -1.0, 0.7, -0.8, 2.0, 0.0, -1.5 };
+    for (int i = 0; i < N; i++) {
+        solver.set_rhs_value(i, b[i], mito::solvers::cuda::InsertMode::INSERT_VALUE);
     }
 
     // finalize the assembly of the linear system
@@ -43,16 +54,16 @@ TEST(Solvers, CUDADenseSolverSymmetricLinearSystem)
     solver.get_solution(x);
 
     // check the solution
-    EXPECT_NEAR(x[0], 5.0, tol);
-    EXPECT_NEAR(x[1], 9.0, tol);
-    EXPECT_NEAR(x[2], 12.0, tol);
-    EXPECT_NEAR(x[3], 14.0, tol);
-    EXPECT_NEAR(x[4], 15.0, tol);
-    EXPECT_NEAR(x[5], 15.0, tol);
-    EXPECT_NEAR(x[6], 14.0, tol);
-    EXPECT_NEAR(x[7], 12.0, tol);
-    EXPECT_NEAR(x[8], 9.0, tol);
-    EXPECT_NEAR(x[9], 5.0, tol);
+    EXPECT_NEAR(x[0], -0.0560503759, tol);
+    EXPECT_NEAR(x[1], -0.6932480517, tol);
+    EXPECT_NEAR(x[2], 0.0666007599, tol);
+    EXPECT_NEAR(x[3], 0.3544482567, tol);
+    EXPECT_NEAR(x[4], -0.4084514057, tol);
+    EXPECT_NEAR(x[5], 0.0767339355, tol);
+    EXPECT_NEAR(x[6], -0.5097195863, tol);
+    EXPECT_NEAR(x[7], 0.8324810769, tol);
+    EXPECT_NEAR(x[8], -0.0333448254, tol);
+    EXPECT_NEAR(x[9], -0.6434741305, tol);
 
     // all done
     return;
@@ -70,16 +81,27 @@ TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem)
     auto solver = mito::solvers::cuda::dense<mito::real>(mito::solvers::cuda::SolverType::LU);
     solver.initialize(N);
 
-    // set matrix and right-hand side entries
+    // set matrix entries
+    double A[10][10] = { { 3.5, -1.2, 0.8, 0.0, -0.5, 1.1, 0.0, 0.3, -0.9, 0.4 },
+                         { 0.6, 4.0, 1.3, -0.4, 0.6, -0.7, 0.0, 0.5, 0.0, -1.0 },
+                         { -1.1, 1.0, 3.8, 0.7, -0.8, 0.0, -0.2, 0.6, -0.5, 0.0 },
+                         { 0.0, -0.4, 0.7, 4.2, -0.9, 0.9, 0.3, 0.0, 0.4, -0.3 },
+                         { 0.9, 0.0, -0.8, 1.0, 4.5, 0.4, -0.7, 0.8, 0.0, 0.2 },
+                         { 1.1, -0.7, 0.0, -1.3, 0.4, 3.6, -0.3, 0.0, -0.4, 0.5 },
+                         { 0.0, 0.0, 1.1, 0.3, -0.7, -0.3, 3.3, 0.6, -0.8, 0.0 },
+                         { 0.3, 0.5, 0.6, 0.0, 0.8, 0.0, 0.6, 4.1, -0.5, 0.7 },
+                         { -0.9, 0.0, -0.5, -1.0, 0.0, -0.4, -0.8, 0.2, 3.7, 0.9 },
+                         { 0.4, 1.3, 0.0, -0.3, 0.2, 0.5, 0.0, 0.7, 0.9, 4.0 } };
     for (int i = 0; i < N; i++) {
-        solver.set_matrix_value(i, i, 2.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
-        if (i > 0) {
-            solver.set_matrix_value(i, i - 1, -1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+        for (int j = 0; j < N; j++) {
+            solver.set_matrix_value(i, j, A[i][j], mito::solvers::cuda::InsertMode::INSERT_VALUE);
         }
-        if (i < N - 1) {
-            solver.set_matrix_value(i, i + 1, -0.5, mito::solvers::cuda::InsertMode::INSERT_VALUE);
-        }
-        solver.set_rhs_value(i, 1.0, mito::solvers::cuda::InsertMode::INSERT_VALUE);
+    }
+
+    // set rhs entries
+    double b[10] = { -1.2, 0.5, 0.7, 1.0, 0.3, -1.5, 2.0, -0.8, 0.6, 1.1 };
+    for (int i = 0; i < N; i++) {
+        solver.set_rhs_value(i, b[i], mito::solvers::cuda::InsertMode::INSERT_VALUE);
     }
 
     // finalize the assembly of the linear system
@@ -93,16 +115,16 @@ TEST(Solvers, CUDADenseSolverUnsymmetricLinearSystem)
     solver.get_solution(x);
 
     // check the solution
-    EXPECT_NEAR(x[0], 0.8284194482079074, tol);
-    EXPECT_NEAR(x[1], 1.3136777928316294, tol);
-    EXPECT_NEAR(x[2], 1.5978722749107030, tol);
-    EXPECT_NEAR(x[3], 1.7641335139795535, tol);
-    EXPECT_NEAR(x[4], 1.8607895060968098, tol);
-    EXPECT_NEAR(x[5], 1.9148909964281315, tol);
-    EXPECT_NEAR(x[6], 1.9379849735189063, tol);
-    EXPECT_NEAR(x[7], 1.9221579012193621, tol);
-    EXPECT_NEAR(x[8], 1.8126616578396353, tol);
-    EXPECT_NEAR(x[9], 1.4063308289198175, tol);
+    EXPECT_NEAR(x[0], -0.1642774470, tol);
+    EXPECT_NEAR(x[1], 0.1352982046, tol);
+    EXPECT_NEAR(x[2], 0.2493049978, tol);
+    EXPECT_NEAR(x[3], 0.2587528172, tol);
+    EXPECT_NEAR(x[4], 0.2711567201, tol);
+    EXPECT_NEAR(x[5], -0.2258981243, tol);
+    EXPECT_NEAR(x[6], 0.6823460181, tol);
+    EXPECT_NEAR(x[7], -0.4005242029, tol);
+    EXPECT_NEAR(x[8], 0.3015605175, tol);
+    EXPECT_NEAR(x[9], 0.2837823381, tol);
 
     // all done
     return;