From 84a7b8be1d5db3c88243441213c2e83adf4390a8 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 16:25:49 +0300 Subject: [PATCH 01/40] first commit --- include/tensor.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/tensor.hpp b/include/tensor.hpp index 71b24d6..351cdef 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -7,12 +7,19 @@ namespace nn::global { using ValueType = float; +enum class Backend { + CPU, + GPU, +}; + class Tensor { private: std::vector data; std::vector shape; std::vector strides; + Backend BackendType; + void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; From 313c342f1156f262189e86808197f637e930d58a Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 16:43:49 +0300 Subject: [PATCH 02/40] new commit --- CMakeLists.txt | 21 +++++++++++++++++---- src/model/tensor.cpp | 1 + src/model/tensor_gpu.cu | 5 +++++ src/model/tensor_gpu.hpp | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 src/model/tensor_gpu.cu create mode 100644 src/model/tensor_gpu.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 86f4cc4..a622b5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,12 @@ cmake_minimum_required(VERSION 3.28) -project(NeuralNetwork LANGUAGES CXX) +project(NeuralNetwork LANGUAGES CXX CUDA) # Add CUDA here # ------------------------------------------------------------------ # Configuration set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) # Add CUDA standard +set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Enforce it set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") @@ -40,7 +42,7 @@ FetchContent_Declare(nlohmann_json FetchContent_MakeAvailable(SFML nlohmann_json) # ------------------------------------------------------------------ -# Function: Apply sanitizers +# Function: Apply sanitizers (for CPU code only) function(apply_sanitizers target) target_compile_options(${target} PRIVATE -fsanitize=address -fno-omit-frame-pointer -g) target_link_libraries(${target} PRIVATE -fsanitize=address) @@ -48,13 +50,22 @@ endfunction() # ------------------------------------------------------------------ # Main library + +# Add both C++ and CUDA sources file(GLOB_RECURSE NN_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu" # Include CUDA source files ) add_library(NeuralNetwork STATIC ${NN_SOURCES}) set_target_properties(NeuralNetwork PROPERTIES POSITION_INDEPENDENT_CODE ON) +# Enable separable compilation for CUDA files +set_target_properties(NeuralNetwork PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON +) + target_include_directories(NeuralNetwork PUBLIC $ @@ -71,19 +82,20 @@ target_link_libraries(NeuralNetwork SFML::Window SFML::System nlohmann_json::nlohmann_json + cuda + cudart ) target_compile_options(NeuralNetwork PRIVATE -Wall -Wextra -Wpedantic) # ------------------------------------------------------------------ -# Tests (with sanitizers) +# Tests option(BUILD_NN_TESTS "Build NeuralNetwork tests" OFF) if(BUILD_NN_TESTS) enable_testing() include(CTest) - # Apply sanitizers only for test builds apply_sanitizers(NeuralNetwork) file(GLOB TEST_SOURCES CONFIGURE_DEPENDS tests/*.cpp) @@ -109,3 +121,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) + diff --git a/src/model/tensor.cpp 
b/src/model/tensor.cpp index 80fd8cb..3765a33 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,6 +1,7 @@ #include #include #include +#include "tensor_gpu.hpp" namespace nn::global { Tensor::Tensor(const std::vector &shape, float init) diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu new file mode 100644 index 0000000..8530403 --- /dev/null +++ b/src/model/tensor_gpu.cu @@ -0,0 +1,5 @@ +#include "tensor_gpu.hpp" +#include + +namespace tensor_gpu { +} diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp new file mode 100644 index 0000000..8294ad7 --- /dev/null +++ b/src/model/tensor_gpu.hpp @@ -0,0 +1,39 @@ +#include + +class Tensor; // Forward declaration + +namespace tensor_gpu { + +/// Allocate memory on GPU for a tensor. +float *allocate(std::size_t count); + +/// Free GPU memory. +void deallocate(float *devicePtr); + +/// Copy data from CPU to GPU. +void copyToDevice(float *deviceDst, const float *hostSrc, std::size_t count); + +/// Copy data from GPU to CPU. +void copyToHost(float *hostDst, const float *deviceSrc, std::size_t count); + +/// Set all elements to zero (on GPU). +void zero(float *deviceData, std::size_t count); + +/// Element-wise addition: C = A + B +void add(const float *A, const float *B, float *C, std::size_t count); + +/// Element-wise multiply: C = A * B +void multiply(const float *A, const float *B, float *C, std::size_t count); + +/// Dot product between two vectors (A · B) +float dot(const float *A, const float *B, std::size_t count); + +/// Apply activation function (e.g., ReLU) +void relu(float *deviceData, std::size_t count); + +/// Apply derivative of activation function (e.g., ReLU') +void relu_derivative(const float *input, float *output, std::size_t count); + +// Add more operations as needed... 
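+//
+// Rough usage sketch of this interface (illustrative only; `hostA`, `hostB`
+// and `hostC` stand for std::vector<float> buffers and are not part of the
+// patch):
+//
+//   float *dA = allocate(n);
+//   float *dB = allocate(n);
+//   float *dC = allocate(n);
+//   copyToDevice(dA, hostA.data(), n);
+//   copyToDevice(dB, hostB.data(), n);
+//   add(dA, dB, dC, n);               // element-wise C = A + B on the device
+//   copyToHost(hostC.data(), dC, n);
+//   deallocate(dA); deallocate(dB); deallocate(dC);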
+ +} // namespace tensor_gpu From 2294ccd197002e4b0495710e9b1ce44a252823ae Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 17:21:28 +0300 Subject: [PATCH 03/40] new commit --- include/tensor.hpp | 38 +++++---- src/model/tensor.cpp | 164 +++++++++++++++++++++++--------------- src/model/tensor_gpu.cu | 165 ++++++++++++++++++++++++++++++++++++++- src/model/tensor_gpu.hpp | 2 - 4 files changed, 284 insertions(+), 85 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 351cdef..f27a3e3 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -14,11 +14,15 @@ enum class Backend { class Tensor { private: - std::vector data; - std::vector shape; - std::vector strides; + std::vector cpu_data; + std::vector cpu_shape; + std::vector cpu_strides; - Backend BackendType; + ValueType *gpu_data = nullptr; + ValueType *gpu_shape = nullptr; + ValueType *gpu_strides = nullptr; + + bool isGpu() const { return true; } void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -27,29 +31,29 @@ class Tensor { // Constructors Tensor(const std::vector &shape, float init = 0.0f); Tensor(const Tensor &other) - : data(other.data), - shape(other.shape), - strides(other.strides) {} + : cpu_data(other.cpu_data), + cpu_shape(other.cpu_shape), + cpu_strides(other.cpu_strides) {} Tensor &operator=(const Tensor &other); // Element access ValueType &operator()(const std::vector &indices); ValueType operator()(const std::vector &indices) const; - inline ValueType &operator[](size_t i) { return data[i]; } - inline const ValueType &operator[](size_t i) const { return data[i]; } + ValueType &operator[](size_t i); + const ValueType &operator[](size_t i) const; // Iterators (for range-based loops) - auto begin() noexcept { return data.begin(); } - auto end() noexcept { return data.end(); } - auto begin() const noexcept { return data.begin(); } - auto end() const noexcept { return data.end(); } + auto begin() noexcept { return cpu_data.begin(); } + auto end() noexcept { return cpu_data.end(); } + auto begin() const noexcept { return cpu_data.begin(); } + auto end() const noexcept { return cpu_data.end(); } // Shape and size - inline const std::vector &getShape() const { return shape; } - inline size_t numElements() const { return data.size(); } - inline const std::vector &getData() const { return data; } - inline void fill(const ValueType &value) { std::fill(begin(), end(), value); } + const std::vector &getShape() const; + size_t numElements() const; + const std::vector &getData() const; + void fill(const ValueType &value); // Arithmetic operations Tensor operator+(const Tensor &other) const; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 3765a33..bf513bf 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,11 +1,11 @@ +#include "tensor_gpu.hpp" #include #include #include -#include "tensor_gpu.hpp" namespace nn::global { Tensor::Tensor(const std::vector &shape, float init) - : shape(shape) { + : cpu_shape(shape) { if (shape.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } @@ -15,101 +15,135 @@ Tensor::Tensor(const std::vector &shape, float init) shape.end(), size_t(1), std::multiplies<>()); - data.assign(totalSize, init); + cpu_data.assign(totalSize, init); computeStrides(); } +ValueType &Tensor::operator[](size_t i) { + if (isGpu()) { + return gpu_data[i]; + } + return cpu_data[i]; +} + +const ValueType &Tensor::operator[](size_t i) const { + if (isGpu()) { + return gpu_data[i]; + } + return cpu_data[i]; +} + 
+const std::vector &Tensor::getShape() const { + + if (isGpu()) { + return gpu_data[i]; + } + return cpu_shape; +} + +size_t Tensor::numElements() const { + return cpu_data.size(); +} + +const std::vector &Tensor::getData() const { + return cpu_data; +} + +void Tensor::fill(const ValueType &value) { + std::fill(begin(), end(), value); +} + Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - data = other.data; - shape = other.shape; - strides = other.strides; + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; return *this; } void Tensor::computeStrides() { - const size_t dim = shape.size(); - strides.resize(dim); + const size_t dim = cpu_shape.size(); + cpu_strides.resize(dim); size_t stride = 1; for (size_t i = dim; i-- > 0;) { - strides[i] = stride; - stride *= shape[i]; + cpu_strides[i] = stride; + stride *= cpu_shape[i]; } } inline size_t Tensor::flattenIndex(const std::vector &indices) const { - if (indices.size() != shape.size()) { + if (indices.size() != cpu_shape.size()) { throw std::invalid_argument("Incorrect number of indices."); } size_t index = 0; - for (size_t i = 0; i < shape.size(); ++i) { - if (indices[i] >= shape[i]) + for (size_t i = 0; i < cpu_shape.size(); ++i) { + if (indices[i] >= cpu_shape[i]) throw std::out_of_range("Index out of bounds."); - index += indices[i] * strides[i]; + index += indices[i] * cpu_strides[i]; } return index; } ValueType &Tensor::operator()(const std::vector &indices) { - return data[flattenIndex(indices)]; + return cpu_data[flattenIndex(indices)]; } ValueType Tensor::operator()(const std::vector &indices) const { - return data[flattenIndex(indices)]; + return cpu_data[flattenIndex(indices)]; } Tensor Tensor::operator+(const Tensor &other) const { - if (shape != other.shape) { + if (cpu_shape != other.cpu_shape) { throw std::invalid_argument("Shape mismatch in Tensor::operator+."); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) r[i] = a[i] + b[i]; return result; } Tensor Tensor::operator-(const Tensor &other) const { - if (shape != other.shape) { + if (cpu_shape != other.cpu_shape) { throw std::invalid_argument("Shape mismatch in Tensor::operator-."); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) r[i] = a[i] - b[i]; return result; } Tensor Tensor::operator/(const Tensor &other) const { - if (shape != other.shape) { + if (cpu_shape != other.cpu_shape) { throw std::invalid_argument("Shape mismatch in Tensor::operator/."); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) r[i] = a[i] / b[i]; return result; } Tensor 
&Tensor::operator+=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch."); - float *__restrict__ a = data.data(); - const float *__restrict__ b = other.data.data(); - const size_t N = data.size(); + float *__restrict__ a = cpu_data.data(); + const float *__restrict__ b = other.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) a[i] += b[i]; @@ -118,53 +152,53 @@ Tensor &Tensor::operator+=(const Tensor &other) { } Tensor &Tensor::operator-=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch."); - float *a = data.data(); - const float *b = other.data.data(); - const size_t N = data.size(); + float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) a[i] -= b[i]; return *this; } Tensor &Tensor::operator*=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); - const size_t N = data.size(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) - data[i] *= other.data[i]; + cpu_data[i] *= other.cpu_data[i]; return *this; } Tensor &Tensor::operator/=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); - const size_t N = data.size(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) - data[i] /= other.data[i]; + cpu_data[i] /= other.cpu_data[i]; return *this; } Tensor &Tensor::operator*=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x *= scalar; return *this; } Tensor &Tensor::operator-=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x -= scalar; return *this; } Tensor &Tensor::operator+=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x += scalar; return *this; } Tensor &Tensor::operator/=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x /= scalar; return *this; } @@ -194,8 +228,8 @@ Tensor Tensor::operator+(ValueType scalar) const { } Tensor Tensor::matmul(const Tensor &other) const { - const auto &aShape = shape; - const auto &bShape = other.shape; + const auto &aShape = cpu_shape; + const auto &bShape = other.cpu_shape; if (aShape.size() != 2 || bShape.size() != 1) throw std::runtime_error("matmul: unsupported shapes."); @@ -207,9 +241,9 @@ Tensor Tensor::matmul(const Tensor &other) const { Tensor result({M}); - const float *A = data.data(); - const float *B = other.data.data(); - float *R = result.data.data(); + const float *A = cpu_data.data(); + const float *B = other.cpu_data.data(); + float *R = result.cpu_data.data(); for (size_t i = 0; i < M; ++i) { float sum = 0.0f; @@ -234,9 +268,9 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { size_t n = bShape[0]; Tensor result({m, n}); - float *r = result.data.data(); - const float *A = a.data.data(); - const float *B = b.data.data(); + float *r = result.cpu_data.data(); + const float *A = a.cpu_data.data(); + const float *B = b.cpu_data.data(); for (size_t i = 0; i < m; ++i) { for (size_t j = 0; j < n; ++j) { @@ -247,8 +281,8 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { } Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = shape; - const auto &vShape = vec.shape; + const auto &wShape = cpu_shape; + const 
auto &vShape = vec.cpu_shape; if (wShape.size() != 2 || vShape.size() != 1) throw std::runtime_error("matmulT: bad dimensions"); @@ -260,9 +294,9 @@ Tensor Tensor::matmulT(const Tensor &vec) const { Tensor result({N}, 0.0f); - const float *W = data.data(); - const float *V = vec.data.data(); - float *R = result.data.data(); + const float *W = cpu_data.data(); + const float *V = vec.cpu_data.data(); + float *R = result.cpu_data.data(); for (size_t i = 0; i < N; ++i) { float sum = 0.0f; diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 8530403..8bf3322 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -1,5 +1,168 @@ -#include "tensor_gpu.hpp" #include +#include "tensor_gpu.hpp" +#include +#include namespace tensor_gpu { + +// Allocate memory on GPU for a tensor. +float* allocate(std::size_t count) { + float* devicePtr = nullptr; + cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(float)); + if (err != cudaSuccess) { + throw std::runtime_error("cudaMalloc failed"); + } + return devicePtr; +} + +// Free GPU memory. +void deallocate(float* devicePtr) { + if (devicePtr) { + cudaFree(devicePtr); + } +} + +// Copy data from CPU to GPU. +void copyToDevice(float* deviceDst, const float* hostSrc, std::size_t count) { + cudaMemcpy(deviceDst, hostSrc, count * sizeof(float), cudaMemcpyHostToDevice); +} + +// Copy data from GPU to CPU. +void copyToHost(float* hostDst, const float* deviceSrc, std::size_t count) { + cudaMemcpy(hostDst, deviceSrc, count * sizeof(float), cudaMemcpyDeviceToHost); +} + +// Kernel to set all elements to zero. +__global__ void zeroKernel(float* data, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + data[idx] = 0.0f; + } +} + +// Set all elements to zero (on GPU). 
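+// Launch configuration used by every wrapper in this file: the grid size is
+// the ceiling of count / 256, so each element gets exactly one thread and the
+// `if (idx < count)` guard inside the kernel covers the final partial block.
+// cudaDeviceSynchronize() makes each call blocking, which keeps the API simple
+// at the cost of a host/device round trip per operation.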
+void zero(float* deviceData, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + zeroKernel<<>>(deviceData, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise addition: C = A + B +__global__ void addKernel(const float* A, const float* B, float* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] + B[idx]; + } +} + +// Element-wise addition: C = A + B +void add(const float* A, const float* B, float* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + addKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise multiplication: C = A * B +__global__ void multiplyKernel(const float* A, const float* B, float* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] * B[idx]; + } +} + +// Element-wise multiply: C = A * B +void multiply(const float* A, const float* B, float* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + multiplyKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Dot product kernel using parallel reduction (simplified version) +__global__ void dotKernel(const float* A, const float* B, float* partialSum, std::size_t count) { + __shared__ float cache[256]; + std::size_t tid = threadIdx.x; + std::size_t idx = blockIdx.x * blockDim.x + tid; + + float temp = 0.0f; + if (idx < count) { + temp = A[idx] * B[idx]; + } + cache[tid] = temp; + __syncthreads(); + + // Reduction in shared memory + for (std::size_t stride = blockDim.x / 2; stride > 0; stride /= 2) { + if (tid < stride) { + cache[tid] += cache[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + partialSum[blockIdx.x] = cache[0]; + } +} + +// Dot product between two vectors (A · B) +float dot(const float* A, const float* B, std::size_t count) { + const std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + + // Allocate partial sums + float* d_partialSum = nullptr; + cudaMalloc(&d_partialSum, numBlocks * sizeof(float)); + + dotKernel<<>>(A, B, d_partialSum, count); + cudaDeviceSynchronize(); + + // Copy partial sums to host + float* h_partialSum = new float[numBlocks]; + cudaMemcpy(h_partialSum, d_partialSum, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); + + // Final reduction on CPU + float totalSum = 0.0f; + for (std::size_t i = 0; i < numBlocks; i++) { + totalSum += h_partialSum[i]; + } + + delete[] h_partialSum; + cudaFree(d_partialSum); + return totalSum; +} + +// Kernel to apply ReLU activation: max(0, x) +__global__ void reluKernel(float* data, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + data[idx] = data[idx] > 0.0f ? data[idx] : 0.0f; + } +} + +// Apply activation function (e.g., ReLU) +void relu(float* deviceData, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + reluKernel<<>>(deviceData, count); + cudaDeviceSynchronize(); +} + +// Kernel to apply ReLU derivative: +// output[i] = input[i] > 0 ? 1 : 0 +__global__ void reluDerivativeKernel(const float* input, float* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + output[idx] = (input[idx] > 0.0f) ? 
1.0f : 0.0f; + } +} + +// Apply derivative of activation function (e.g., ReLU') +void relu_derivative(const float* input, float* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + reluDerivativeKernel<<>>(input, output, count); + cudaDeviceSynchronize(); } +} // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 8294ad7..f6d9e62 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -34,6 +34,4 @@ void relu(float *deviceData, std::size_t count); /// Apply derivative of activation function (e.g., ReLU') void relu_derivative(const float *input, float *output, std::size_t count); -// Add more operations as needed... - } // namespace tensor_gpu From 590e9ccc66be7b10be0feac7e43072b846a8ce75 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 17:56:15 +0300 Subject: [PATCH 04/40] new commit --- include/tensor.hpp | 7 +- src/model/tensor.cpp | 258 +++++++++++++++++--------------- src/networks/fnn/DenseLayer.hpp | 10 +- 3 files changed, 152 insertions(+), 123 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index f27a3e3..6af117d 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -19,10 +19,14 @@ class Tensor { std::vector cpu_strides; ValueType *gpu_data = nullptr; + size_t gpu_data_size{0}; ValueType *gpu_shape = nullptr; + size_t gpu_shape_size{0}; ValueType *gpu_strides = nullptr; + size_t gpu_strides_size{0}; - bool isGpu() const { return true; } + + static const bool isGpu{false}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -50,7 +54,6 @@ class Tensor { auto end() const noexcept { return cpu_data.end(); } // Shape and size - const std::vector &getShape() const; size_t numElements() const; const std::vector &getData() const; void fill(const ValueType &value); diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index bf513bf..ed2dedc 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -4,8 +4,7 @@ #include namespace nn::global { -Tensor::Tensor(const std::vector &shape, float init) - : cpu_shape(shape) { +Tensor::Tensor(const std::vector &shape, float init) { if (shape.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } @@ -15,38 +14,38 @@ Tensor::Tensor(const std::vector &shape, float init) shape.end(), size_t(1), std::multiplies<>()); - cpu_data.assign(totalSize, init); + + if (!isGpu) { + cpu_shape = shape; + cpu_data.assign(totalSize, init); + } + computeStrides(); } ValueType &Tensor::operator[](size_t i) { - if (isGpu()) { - return gpu_data[i]; + if (isGpu) { } return cpu_data[i]; } const ValueType &Tensor::operator[](size_t i) const { - if (isGpu()) { - return gpu_data[i]; + if (isGpu) { } return cpu_data[i]; } -const std::vector &Tensor::getShape() const { - - if (isGpu()) { - return gpu_data[i]; - } - return cpu_shape; -} - size_t Tensor::numElements() const { + if (isGpu) { + return gpu_data_size; + } return cpu_data.size(); } const std::vector &Tensor::getData() const { - return cpu_data; + if (!isGpu) { + return cpu_data; + } } void Tensor::fill(const ValueType &value) { @@ -57,10 +56,12 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; - + if (!isGpu) { + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; + } else { + } return *this; } @@ -138,68 +139,86 @@ Tensor 
Tensor::operator/(const Tensor &other) const { } Tensor &Tensor::operator+=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch."); - - float *__restrict__ a = cpu_data.data(); - const float *__restrict__ b = other.cpu_data.data(); - const size_t N = cpu_data.size(); - - for (size_t i = 0; i < N; ++i) - a[i] += b[i]; - + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] += other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator-=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch."); - float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - a[i] -= b[i]; + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] -= other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator*=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - cpu_data[i] *= other.cpu_data[i]; + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] *= other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator/=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - cpu_data[i] /= other.cpu_data[i]; + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] /= other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator*=(ValueType scalar) { - for (auto &x : cpu_data) - x *= scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x *= scalar; + } else { + } return *this; } Tensor &Tensor::operator-=(ValueType scalar) { - for (auto &x : cpu_data) - x -= scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x -= scalar; + } else { + } return *this; } Tensor &Tensor::operator+=(ValueType scalar) { - for (auto &x : cpu_data) - x += scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x += scalar; + } else { + } return *this; } + Tensor &Tensor::operator/=(ValueType scalar) { - for (auto &x : cpu_data) - x /= scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x /= scalar; + } else { + } return *this; } @@ -228,83 +247,86 @@ Tensor Tensor::operator+(ValueType scalar) const { } Tensor Tensor::matmul(const Tensor &other) const { - const auto &aShape = cpu_shape; - const auto &bShape = other.cpu_shape; - - if (aShape.size() != 2 || bShape.size() != 1) - throw std::runtime_error("matmul: unsupported shapes."); - - size_t M = aShape[0]; - size_t K = aShape[1]; - if (K != bShape[0]) - throw std::runtime_error("matmul: shape mismatch."); - - Tensor result({M}); - - const float *A = cpu_data.data(); - const float *B = other.cpu_data.data(); - float *R = result.cpu_data.data(); - - for (size_t i = 0; i < M; 
++i) { - float sum = 0.0f; - size_t base = i * K; - for (size_t j = 0; j < K; ++j) { - sum += A[base + j] * B[j]; + if (!isGpu) { + const auto &aShape = cpu_shape; + const auto &bShape = other.cpu_shape; + + if (aShape.size() != 2 || bShape.size() != 1) + throw std::runtime_error("matmul: unsupported shapes."); + + size_t M = aShape[0]; + size_t K = aShape[1]; + if (K != bShape[0]) + throw std::runtime_error("matmul: shape mismatch."); + + Tensor result({M}); + + const float *A = cpu_data.data(); + const float *B = other.cpu_data.data(); + float *R = result.cpu_data.data(); + + for (size_t i = 0; i < M; ++i) { + float sum = 0.0f; + size_t base = i * K; + for (size_t j = 0; j < K; ++j) { + sum += A[base + j] * B[j]; + } + R[i] = sum; } - R[i] = sum; + return result; } - return result; } Tensor Tensor::outer(const Tensor &a, const Tensor &b) { - const std::vector &aShape = a.getShape(); - const std::vector &bShape = b.getShape(); - - if (aShape.size() != 1 || bShape.size() != 1) { - throw std::runtime_error("outer: both tensors must be 1D vectors"); - } + if (!isGpu) { + if (a.cpu_shape.size() != 1 || b.cpu_shape.size() != 1) { + throw std::runtime_error("outer: both tensors must be 1D vectors"); + } - size_t m = aShape[0]; - size_t n = bShape[0]; + size_t m = a.cpu_shape[0]; + size_t n = b.cpu_shape[0]; - Tensor result({m, n}); - float *r = result.cpu_data.data(); - const float *A = a.cpu_data.data(); - const float *B = b.cpu_data.data(); + Tensor result({m, n}); + float *r = result.cpu_data.data(); + const float *A = a.cpu_data.data(); + const float *B = b.cpu_data.data(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - r[i * n + j] = A[i] * B[j]; + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + r[i * n + j] = A[i] * B[j]; + } } + return result; } - return result; } Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = cpu_shape; - const auto &vShape = vec.cpu_shape; - - if (wShape.size() != 2 || vShape.size() != 1) - throw std::runtime_error("matmulT: bad dimensions"); - - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) - throw std::runtime_error("matmulT: incompatible"); - - Tensor result({N}, 0.0f); - - const float *W = cpu_data.data(); - const float *V = vec.cpu_data.data(); - float *R = result.cpu_data.data(); - - for (size_t i = 0; i < N; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < M; ++j) { - sum += W[j * N + i] * V[j]; + if (!isGpu) { + const auto &wShape = cpu_shape; + const auto &vShape = vec.cpu_shape; + + if (wShape.size() != 2 || vShape.size() != 1) + throw std::runtime_error("matmulT: bad dimensions"); + + size_t M = wShape[0]; + size_t N = wShape[1]; + if (vShape[0] != M) + throw std::runtime_error("matmulT: incompatible"); + + Tensor result({N}, 0.0f); + + const float *W = cpu_data.data(); + const float *V = vec.cpu_data.data(); + float *R = result.cpu_data.data(); + + for (size_t i = 0; i < N; ++i) { + float sum = 0.0f; + for (size_t j = 0; j < M; ++j) { + sum += W[j * N + i] * V[j]; + } + R[i] = sum; } - R[i] = sum; + return result; } - return result; } } // namespace nn::global diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp index 84587aa..2b401a2 100644 --- a/src/networks/fnn/DenseLayer.hpp +++ b/src/networks/fnn/DenseLayer.hpp @@ -11,11 +11,15 @@ struct LayerParams { global::Tensor weights; global::Tensor biases; + size_t size_; + size_t prevSize_; + LayerParams(size_t out_dim, size_t in_dim) - : weights({out_dim, in_dim}), biases({out_dim}) {} + 
: weights({out_dim, in_dim}), biases({out_dim}), + size_(out_dim), prevSize_(in_dim) {} - size_t size() const { return biases.numElements(); } - size_t prevSize() const { return weights.getShape()[1]; } + size_t size() const { return size_; } + size_t prevSize() const { return prevSize_; } size_t paramSize() const { return biases.numElements() + weights.numElements(); } }; From b1a9c5359496a676b201ab65565c424bff2c9e5e Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 18:38:27 +0300 Subject: [PATCH 05/40] new commit --- include/tensor.hpp | 13 +++- src/model/activations.cpp | 135 +++++++++++++++++--------------- src/model/activations.hpp | 3 - src/model/tensor_gpu.cu | 158 +++++++++++++++++++++++++++++++------- src/model/tensor_gpu.hpp | 44 +++++++---- 5 files changed, 243 insertions(+), 110 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 6af117d..758e513 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -4,6 +4,10 @@ #include #include +namespace nn::model { +class Activation; +} + namespace nn::global { using ValueType = float; @@ -19,18 +23,19 @@ class Tensor { std::vector cpu_strides; ValueType *gpu_data = nullptr; - size_t gpu_data_size{0}; + std::size_t gpu_data_size{0}; ValueType *gpu_shape = nullptr; - size_t gpu_shape_size{0}; + size_t gpu_shape_size{0}; ValueType *gpu_strides = nullptr; - size_t gpu_strides_size{0}; - + size_t gpu_strides_size{0}; static const bool isGpu{false}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; + friend model::Activation; + public: // Constructors Tensor(const std::vector &shape, float init = 0.0f); diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 19669bf..525c227 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,36 +1,7 @@ #include "activations.hpp" +#include "tensor_gpu.hpp" namespace nn::model { -global::ValueType Activation::activate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return relu(z); - case ActivationType::LeakyRelu: - return leakyRelu(z); - case ActivationType::Sigmoid: - return sigmoid(z); - case ActivationType::Tanh: - return tanh(z); - default: - return z; - } -} - -global::ValueType Activation::derivativeActivate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return derivativeRelu(z); - case ActivationType::LeakyRelu: - return derivativeLeakyRelu(z); - case ActivationType::Sigmoid: - return derivativeSigmoid(z); - case ActivationType::Tanh: - return derivativeTanh(z); - default: - return z; - } -} - void Activation::activate(const global::Tensor &net, global::Tensor &out) const { switch (activationType) { case ActivationType::Relu: @@ -114,61 +85,105 @@ global::ValueType Activation::derivativeTanh(const global::ValueType z) { } void Activation::relu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = relu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = relu(net[i]); + } + } } void Activation::derivativeRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= 
derivativeRelu(net[i]); + } + } } void Activation::leakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = leakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = leakyRelu(net[i]); + } + } } void Activation::derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeLeakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= derivativeLeakyRelu(net[i]); + } + } } void Activation::sigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = sigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = sigmoid(net[i]); + } + } } void Activation::derivativeSigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeSigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= derivativeSigmoid(net[i]); + } + } } void Activation::tanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = tanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_activation(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = tanh(net[i]); + } + } } void Activation::derivativeTanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeTanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= derivativeTanh(net[i]); + } + } } void Activation::softmax(const global::Tensor &net, global::Tensor &out) { - global::ValueType max = maxVector(net); - global::ValueType sum = 0.0; - - for (size_t i = 0; i < net.numElements(); ++i) { - global::ValueType x = net[i] - max; - if (x < -700.0) - x = -700.0; - if (x > 700.0) - x = 700.0; - out[i] = std::exp(x); - sum += out[i]; - } + if (net.isGpu) { + + } else { + global::ValueType max = maxVector(net); + global::ValueType sum = 0.0; + + for (size_t i = 0; i < net.numElements(); ++i) { + global::ValueType x = net[i] - max; + if (x < -700.0) + x = -700.0; + if (x > 700.0) + x = 700.0; + out[i] = std::exp(x); + sum += out[i]; + } - sum = maxValue(sum, 1e-10); + sum = maxValue(sum, 1e-10); - out /= sum; + out /= sum; + } } } // namespace nn::model diff --git a/src/model/activations.hpp b/src/model/activations.hpp index e49a8f0..9c6b4a2 100644 --- a/src/model/activations.hpp +++ b/src/model/activations.hpp @@ -67,9 +67,6 @@ class Activation { : activationType(other.activationType) {} ~Activation() = default; - global::ValueType activate(const global::ValueType x) const; - global::ValueType derivativeActivate(const global::ValueType x) const; - void activate(const global::Tensor &net, global::Tensor &out) const; void derivativeActivate(const global::Tensor &net, 
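Every GPU activation wired up below follows the same element-wise pattern as the earlier kernels: a __global__ kernel that assigns one thread per element, plus a thin host wrapper that ceil-divides the element count by the block size, launches, and synchronizes. A minimal self-contained sketch of that pattern, with illustrative names (squareKernel/square are not part of the patch):

    #include <cstddef>
    #include <cuda_runtime.h>

    // One thread per element; the bounds check handles the final partial block.
    __global__ void squareKernel(const float *in, float *out, std::size_t count) {
        std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < count) {
            out[idx] = in[idx] * in[idx];
        }
    }

    // Host wrapper: compute the grid size, launch, then block until the kernel finishes.
    void square(const float *in, float *out, std::size_t count) {
        const std::size_t blockSize = 256;
        const std::size_t numBlocks = (count + blockSize - 1) / blockSize;
        squareKernel<<<numBlocks, blockSize>>>(in, out, count);
        cudaDeviceSynchronize();
    }

The sigmoid, tanh and leaky-ReLU wrappers in the diff that follows differ only in the per-element expression and, for leaky ReLU, an extra alpha parameter.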
diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 8bf3322..cd2d5a7 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -3,12 +3,11 @@ #include #include -namespace tensor_gpu { - +namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. -float* allocate(std::size_t count) { - float* devicePtr = nullptr; - cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(float)); +ValueType* allocate(std::size_t count) { + ValueType* devicePtr = nullptr; + cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(ValueType)); if (err != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } @@ -16,24 +15,24 @@ float* allocate(std::size_t count) { } // Free GPU memory. -void deallocate(float* devicePtr) { +void deallocate(ValueType* devicePtr) { if (devicePtr) { cudaFree(devicePtr); } } // Copy data from CPU to GPU. -void copyToDevice(float* deviceDst, const float* hostSrc, std::size_t count) { - cudaMemcpy(deviceDst, hostSrc, count * sizeof(float), cudaMemcpyHostToDevice); +void copyToDevice(ValueType* deviceDst, const ValueType* hostSrc, std::size_t count) { + cudaMemcpy(deviceDst, hostSrc, count * sizeof(ValueType), cudaMemcpyHostToDevice); } // Copy data from GPU to CPU. -void copyToHost(float* hostDst, const float* deviceSrc, std::size_t count) { - cudaMemcpy(hostDst, deviceSrc, count * sizeof(float), cudaMemcpyDeviceToHost); +void copyToHost(ValueType* hostDst, const ValueType* deviceSrc, std::size_t count) { + cudaMemcpy(hostDst, deviceSrc, count * sizeof(ValueType), cudaMemcpyDeviceToHost); } // Kernel to set all elements to zero. -__global__ void zeroKernel(float* data, std::size_t count) { +__global__ void zeroKernel(ValueType* data, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { data[idx] = 0.0f; @@ -41,7 +40,7 @@ __global__ void zeroKernel(float* data, std::size_t count) { } // Set all elements to zero (on GPU). 
-void zero(float* deviceData, std::size_t count) { +void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); @@ -49,7 +48,7 @@ void zero(float* deviceData, std::size_t count) { } // Kernel for element-wise addition: C = A + B -__global__ void addKernel(const float* A, const float* B, float* C, std::size_t count) { +__global__ void addKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { C[idx] = A[idx] + B[idx]; @@ -57,7 +56,7 @@ __global__ void addKernel(const float* A, const float* B, float* C, std::size_t } // Element-wise addition: C = A + B -void add(const float* A, const float* B, float* C, std::size_t count) { +void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); @@ -65,7 +64,7 @@ void add(const float* A, const float* B, float* C, std::size_t count) { } // Kernel for element-wise multiplication: C = A * B -__global__ void multiplyKernel(const float* A, const float* B, float* C, std::size_t count) { +__global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { C[idx] = A[idx] * B[idx]; @@ -73,7 +72,7 @@ __global__ void multiplyKernel(const float* A, const float* B, float* C, std::si } // Element-wise multiply: C = A * B -void multiply(const float* A, const float* B, float* C, std::size_t count) { +void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); @@ -81,8 +80,8 @@ void multiply(const float* A, const float* B, float* C, std::size_t count) { } // Dot product kernel using parallel reduction (simplified version) -__global__ void dotKernel(const float* A, const float* B, float* partialSum, std::size_t count) { - __shared__ float cache[256]; +__global__ void dotKernel(const ValueType* A, const ValueType* B, ValueType* partialSum, std::size_t count) { + __shared__ ValueType cache[256]; std::size_t tid = threadIdx.x; std::size_t idx = blockIdx.x * blockDim.x + tid; @@ -107,23 +106,23 @@ __global__ void dotKernel(const float* A, const float* B, float* partialSum, std } // Dot product between two vectors (A · B) -float dot(const float* A, const float* B, std::size_t count) { +float dot(const ValueType* A, const ValueType* B, std::size_t count) { const std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; // Allocate partial sums - float* d_partialSum = nullptr; - cudaMalloc(&d_partialSum, numBlocks * sizeof(float)); + ValueType* d_partialSum = nullptr; + cudaMalloc(&d_partialSum, numBlocks * sizeof(ValueType)); dotKernel<<>>(A, B, d_partialSum, count); cudaDeviceSynchronize(); // Copy partial sums to host - float* h_partialSum = new float[numBlocks]; + ValueType* h_partialSum = new ValueType[numBlocks]; cudaMemcpy(h_partialSum, d_partialSum, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); // Final reduction on CPU - float totalSum = 0.0f; + ValueType totalSum = 0.0f; for (std::size_t i = 0; i < numBlocks; i++) { totalSum += h_partialSum[i]; } @@ -134,24 +133,24 @@ float dot(const float* A, const float* 
B, std::size_t count) { } // Kernel to apply ReLU activation: max(0, x) -__global__ void reluKernel(float* data, std::size_t count) { +__global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { - data[idx] = data[idx] > 0.0f ? data[idx] : 0.0f; + output[idx] = input[idx] > 0.0 ? input[idx] : 0.0f; } } // Apply activation function (e.g., ReLU) -void relu(float* deviceData, std::size_t count) { +void relu(const ValueType *input, ValueType *output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - reluKernel<<>>(deviceData, count); + reluKernel<<>>(input, output, count); cudaDeviceSynchronize(); } // Kernel to apply ReLU derivative: // output[i] = input[i] > 0 ? 1 : 0 -__global__ void reluDerivativeKernel(const float* input, float* output, std::size_t count) { +__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { output[idx] = (input[idx] > 0.0f) ? 1.0f : 0.0f; @@ -159,10 +158,111 @@ __global__ void reluDerivativeKernel(const float* input, float* output, std::siz } // Apply derivative of activation function (e.g., ReLU') -void relu_derivative(const float* input, float* output, std::size_t count) { +void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); cudaDeviceSynchronize(); } + +// Kernel to apply Sigmoid activation: 1 / (1 + exp(-x)) +__global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType x = input[idx]; + output[idx] = 1.0f / (1.0f + expf(-x)); + } +} + +// Apply Sigmoid activation +void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + sigmoidKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel for Sigmoid derivative: s(x) * (1 - s(x)) +__global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType x = input[idx]; + ValueType s = 1.0f / (1.0f + expf(-x)); + output[idx] = s * (1.0f - s); + } +} + +// Apply Sigmoid derivative +void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + sigmoidDerivativeKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel to apply Tanh activation: tanh(x) +__global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + output[idx] = tanhf(input[idx]); + } +} + +// Apply Tanh activation +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + tanhKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel for Tanh derivative: 1 - tanh(x)^2 +__global__ void tanhDerivativeKernel(const ValueType* input, ValueType* 
output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType t = tanhf(input[idx]); + output[idx] = 1.0f - t * t; + } +} + +// Apply Tanh derivative +void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + tanhDerivativeKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel for Leaky ReLU: x > 0 ? x : alpha * x +__global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType x = input[idx]; + output[idx] = (x > 0.0f) ? x : alpha * x; + } +} + +// Apply Leaky ReLU +void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + leakyReluKernel<<>>(input, output, count, alpha); + cudaDeviceSynchronize(); +} + +// Kernel for Leaky ReLU derivative: x > 0 ? 1 : alpha +__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + output[idx] = (input[idx] > 0.0f) ? 1.0f : alpha; + } +} + +// Apply Leaky ReLU derivative +void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + leakyReluDerivativeKernel<<>>(input, output, count, alpha); + cudaDeviceSynchronize(); +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index f6d9e62..cde66d2 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -1,37 +1,53 @@ +#ifndef TENSOR_GPU +#define TENSOR_GPU + +#include "tensor.hpp" #include class Tensor; // Forward declaration -namespace tensor_gpu { +namespace nn::global::tensor_gpu { /// Allocate memory on GPU for a tensor. -float *allocate(std::size_t count); +ValueType *allocate(std::size_t count); /// Free GPU memory. -void deallocate(float *devicePtr); +void deallocate(ValueType *devicePtr); /// Copy data from CPU to GPU. -void copyToDevice(float *deviceDst, const float *hostSrc, std::size_t count); +void copyToDevice(ValueType *deviceDst, const ValueType *hostSrc, std::size_t count); /// Copy data from GPU to CPU. -void copyToHost(float *hostDst, const float *deviceSrc, std::size_t count); +void copyToHost(ValueType *hostDst, const ValueType *deviceSrc, std::size_t count); /// Set all elements to zero (on GPU). 
-void zero(float *deviceData, std::size_t count); +void zero(ValueType *deviceData, std::size_t count); /// Element-wise addition: C = A + B -void add(const float *A, const float *B, float *C, std::size_t count); +void add(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise multiply: C = A * B -void multiply(const float *A, const float *B, float *C, std::size_t count); +void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Dot product between two vectors (A · B) -float dot(const float *A, const float *B, std::size_t count); +float dot(const ValueType *A, const ValueType *B, std::size_t count); + +// ---------------- ReLU ---------------- +void relu(const ValueType *input, ValueType *output, std::size_t count); +void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); + +// ---------------- Sigmoid ---------------- +void sigmoid(const ValueType *input, ValueType *output, std::size_t count); +void sigmoid_derivative(const ValueType *input, ValueType *output, std::size_t count); + +// ---------------- Tanh ---------------- +void tanh_activation(const ValueType *input, ValueType *output, std::size_t count); +void tanh_derivative(const ValueType *input, ValueType *output, std::size_t count); -/// Apply activation function (e.g., ReLU) -void relu(float *deviceData, std::size_t count); +// ---------------- Leaky ReLU ---------------- +void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); +void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); -/// Apply derivative of activation function (e.g., ReLU') -void relu_derivative(const float *input, float *output, std::size_t count); +} // namespace nn::global::tensor_gpu -} // namespace tensor_gpu +#endif // TENSOR_GPU From 6aecfb500b79ae1e5865877cd9c9376aeb6525a1 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 19:03:50 +0300 Subject: [PATCH 06/40] new commit --- include/tensor.hpp | 6 +-- src/model/tensor.cpp | 99 +++++++++++++++++++++++----------------- src/model/tensor_gpu.cu | 10 ++-- src/model/tensor_gpu.hpp | 8 ++-- 4 files changed, 71 insertions(+), 52 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 758e513..6da851a 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -3,13 +3,13 @@ #include #include +#include "../src/model/tensor_gpu.hpp" namespace nn::model { class Activation; } namespace nn::global { -using ValueType = float; enum class Backend { CPU, @@ -24,9 +24,9 @@ class Tensor { ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; - ValueType *gpu_shape = nullptr; + size_t *gpu_shape = nullptr; size_t gpu_shape_size{0}; - ValueType *gpu_strides = nullptr; + size_t *gpu_strides = nullptr; size_t gpu_strides_size{0}; static const bool isGpu{false}; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ed2dedc..67d06c7 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -18,6 +18,11 @@ Tensor::Tensor(const std::vector &shape, float init) { if (!isGpu) { cpu_shape = shape; cpu_data.assign(totalSize, init); + } else { + gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); + tensor_gpu::copyToDevice(gpu_shape, shape.data(), gpu_data_size * sizeof(size_t)); + gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_data_size = totalSize; } computeStrides(); @@ -76,66 +81,78 @@ void Tensor::computeStrides() { } inline 
size_t Tensor::flattenIndex(const std::vector &indices) const { - if (indices.size() != cpu_shape.size()) { - throw std::invalid_argument("Incorrect number of indices."); - } - size_t index = 0; - for (size_t i = 0; i < cpu_shape.size(); ++i) { - if (indices[i] >= cpu_shape[i]) - throw std::out_of_range("Index out of bounds."); - index += indices[i] * cpu_strides[i]; + if (!isGpu) { + if (indices.size() != cpu_shape.size()) { + throw std::invalid_argument("Incorrect number of indices."); + } + size_t index = 0; + for (size_t i = 0; i < cpu_shape.size(); ++i) { + if (indices[i] >= cpu_shape[i]) + throw std::out_of_range("Index out of bounds."); + index += indices[i] * cpu_strides[i]; + } + return index; } - return index; } ValueType &Tensor::operator()(const std::vector &indices) { - return cpu_data[flattenIndex(indices)]; + if (!isGpu) { + return cpu_data[flattenIndex(indices)]; + } } ValueType Tensor::operator()(const std::vector &indices) const { - return cpu_data[flattenIndex(indices)]; + if (!isGpu) { + return cpu_data[flattenIndex(indices)]; + } } Tensor Tensor::operator+(const Tensor &other) const { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator+."); + if (!isGpu) { + if (cpu_shape != other.cpu_shape) { + throw std::invalid_argument("Shape mismatch in Tensor::operator+."); + } + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + r[i] = a[i] + b[i]; + return result; } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] + b[i]; - return result; } Tensor Tensor::operator-(const Tensor &other) const { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator-."); + if (!isGpu) { + if (cpu_shape != other.cpu_shape) { + throw std::invalid_argument("Shape mismatch in Tensor::operator-."); + } + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + r[i] = a[i] - b[i]; + return result; } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] - b[i]; - return result; } Tensor Tensor::operator/(const Tensor &other) const { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator/."); + if (!isGpu) { + if (cpu_shape != other.cpu_shape) { + throw std::invalid_argument("Shape mismatch in Tensor::operator/."); + } + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + r[i] = a[i] / b[i]; + return result; } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] / b[i]; - return result; } Tensor &Tensor::operator+=(const Tensor &other) { diff --git a/src/model/tensor_gpu.cu 
b/src/model/tensor_gpu.cu index cd2d5a7..b911481 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -5,9 +5,9 @@ namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. -ValueType* allocate(std::size_t count) { - ValueType* devicePtr = nullptr; - cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(ValueType)); +void* allocate(std::size_t count) { + void* devicePtr = nullptr; + cudaError_t err = cudaMalloc(&devicePtr, count); if (err != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } @@ -22,8 +22,8 @@ void deallocate(ValueType* devicePtr) { } // Copy data from CPU to GPU. -void copyToDevice(ValueType* deviceDst, const ValueType* hostSrc, std::size_t count) { - cudaMemcpy(deviceDst, hostSrc, count * sizeof(ValueType), cudaMemcpyHostToDevice); +void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { + cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } // Copy data from GPU to CPU. diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index cde66d2..77f7aa3 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -1,21 +1,23 @@ #ifndef TENSOR_GPU #define TENSOR_GPU -#include "tensor.hpp" #include +namespace nn::global { +using ValueType = float; +} class Tensor; // Forward declaration namespace nn::global::tensor_gpu { /// Allocate memory on GPU for a tensor. -ValueType *allocate(std::size_t count); +void *allocate(std::size_t count); /// Free GPU memory. void deallocate(ValueType *devicePtr); /// Copy data from CPU to GPU. -void copyToDevice(ValueType *deviceDst, const ValueType *hostSrc, std::size_t count); +void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); /// Copy data from GPU to CPU. void copyToHost(ValueType *hostDst, const ValueType *deviceSrc, std::size_t count); From 2d8637c3f098543ed07e8b2af7b0ad64208f67a3 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 19:51:36 +0300 Subject: [PATCH 07/40] new commit --- include/tensor.hpp | 3 +-- src/model/tensor.cpp | 25 +++++++++++++++++++------ src/model/tensor_gpu.cu | 22 ++++++++++++++++++++-- src/model/tensor_gpu.hpp | 6 +++++- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 6da851a..2a66eb8 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -25,9 +25,8 @@ class Tensor { ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; size_t *gpu_shape = nullptr; - size_t gpu_shape_size{0}; size_t *gpu_strides = nullptr; - size_t gpu_strides_size{0}; + size_t gpu_shape_size{0}; static const bool isGpu{false}; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 67d06c7..6547556 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -66,17 +66,30 @@ Tensor &Tensor::operator=(const Tensor &other) { cpu_shape = other.cpu_shape; cpu_strides = other.cpu_strides; } else { + gpu_shape = (size_t *)tensor_gpu::allocate(other.gpu_shape_size * sizeof(size_t)); + gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + gpu_data_size = other.gpu_data_size; + gpu_shape_size = other.gpu_shape_size; + + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); + tensor_gpu::copyDeviceToDevice(gpu_strides, other.gpu_strides, gpu_shape_size * sizeof(size_t)); } return *this; } void Tensor::computeStrides() { - const size_t dim = cpu_shape.size(); - 
cpu_strides.resize(dim); - size_t stride = 1; - for (size_t i = dim; i-- > 0;) { - cpu_strides[i] = stride; - stride *= cpu_shape[i]; + if (isGpu) { + gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); + tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size * sizeof(size_t)); + } else { + const size_t dim = cpu_shape.size(); + cpu_strides.resize(dim); + size_t stride = 1; + for (size_t i = dim; i-- > 0;) { + cpu_strides[i] = stride; + stride *= cpu_shape[i]; + } } } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index b911481..9199305 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -26,9 +26,14 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } + +void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count) { + cudaMemcpy(deviceDst, deviceDst, count, cudaMemcpyDeviceToDevice); +} + // Copy data from GPU to CPU. -void copyToHost(ValueType* hostDst, const ValueType* deviceSrc, std::size_t count) { - cudaMemcpy(hostDst, deviceSrc, count * sizeof(ValueType), cudaMemcpyDeviceToHost); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count) { + cudaMemcpy(hostDst, deviceSrc, count, cudaMemcpyDeviceToHost); } // Kernel to set all elements to zero. @@ -132,6 +137,19 @@ float dot(const ValueType* A, const ValueType* B, std::size_t count) { return totalSum; } +__global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim) { + size_t stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + strides[i] = stride; + stride *= shape[i]; + } +} + +void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim) { + computeStrides<<<1, 1>>>(gpu_shape, gpu_strides, ndim); + cudaDeviceSynchronize(); // Ensure computation completes +} + // Kernel to apply ReLU activation: max(0, x) __global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 77f7aa3..3195650 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -20,7 +20,9 @@ void deallocate(ValueType *devicePtr); void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); /// Copy data from GPU to CPU. -void copyToHost(ValueType *hostDst, const ValueType *deviceSrc, std::size_t count); +void copyToHost(void *hostDst, const void *deviceSrc, std::size_t count); + +void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count); /// Set all elements to zero (on GPU). 
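
For reference, the row-major stride layout that computeStrides()/computeStridesDevice() build in this patch can also be prepared on the host and uploaded with a single cudaMemcpy. The sketch below is illustrative only (uploadRowMajorStrides and its arguments are not part of the patches) and simply mirrors the CPU loop shown above.

#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Compute strides[i] = shape[i+1] * ... * shape[n-1] on the host, then copy
// them into an already-allocated device buffer of shape.size() elements.
inline void uploadRowMajorStrides(const std::vector<std::size_t> &shape, std::size_t *d_strides) {
    std::vector<std::size_t> strides(shape.size());
    std::size_t stride = 1;
    for (std::size_t i = shape.size(); i-- > 0;) {   // same reverse loop as the CPU path
        strides[i] = stride;
        stride *= shape[i];
    }
    cudaMemcpy(d_strides, strides.data(), strides.size() * sizeof(std::size_t), cudaMemcpyHostToDevice);
}
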
void zero(ValueType *deviceData, std::size_t count); @@ -34,6 +36,8 @@ void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t /// Dot product between two vectors (A · B) float dot(const ValueType *A, const ValueType *B, std::size_t count); +void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); + // ---------------- ReLU ---------------- void relu(const ValueType *input, ValueType *output, std::size_t count); void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); From 228b6d6c2d02bc1972dc73f405db553f5a085b65 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 10:53:09 +0300 Subject: [PATCH 08/40] new commit --- include/tensor.hpp | 8 ++-- src/model/activations.cpp | 29 ++++++------ src/model/tensor.cpp | 23 +++++++++- src/model/tensor_gpu.cu | 97 ++++++++++++++++++++++++++++++++++++++- src/model/tensor_gpu.hpp | 10 ++++ 5 files changed, 148 insertions(+), 19 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 2a66eb8..9147fb1 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -1,9 +1,9 @@ #ifndef TENSOR #define TENSOR +#include "../src/model/tensor_gpu.hpp" #include #include -#include "../src/model/tensor_gpu.hpp" namespace nn::model { class Activation; @@ -23,7 +23,7 @@ class Tensor { std::vector cpu_strides; ValueType *gpu_data = nullptr; - std::size_t gpu_data_size{0}; + std::size_t gpu_data_size{0}; size_t *gpu_shape = nullptr; size_t *gpu_strides = nullptr; size_t gpu_shape_size{0}; @@ -33,6 +33,8 @@ class Tensor { void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; + void loadTempGpu() const; + friend model::Activation; public: @@ -59,7 +61,7 @@ class Tensor { // Shape and size size_t numElements() const; - const std::vector &getData() const; + void getData(std::vector &dest) const; void fill(const ValueType &value); // Arithmetic operations diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 525c227..319104b 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,4 +1,5 @@ #include "activations.hpp" +#include "tensor.hpp" #include "tensor_gpu.hpp" namespace nn::model { @@ -44,7 +45,9 @@ void Activation::derivativeActivate(const global::Tensor &net, global::Tensor &o } global::ValueType Activation::maxVector(const global::Tensor &metrix) { - global::ValueType max = metrix[0]; + if (metrix.isGpu) { + } + global::ValueType max = metrix.cpu_data[0]; for (auto &value : metrix) { if (value > max) { max = value; @@ -89,7 +92,7 @@ void Activation::relu(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::relu(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = relu(net[i]); + out.cpu_data[i] = relu(net.cpu_data[i]); } } } @@ -99,7 +102,7 @@ void Activation::derivativeRelu(const global::Tensor &net, global::Tensor &out) global::tensor_gpu::relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeRelu(net[i]); + out.cpu_data[i] *= derivativeRelu(net.cpu_data[i]); } } } @@ -109,7 +112,7 @@ void Activation::leakyRelu(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::leaky_relu(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = leakyRelu(net[i]); + out.cpu_data[i] = leakyRelu(net.cpu_data[i]); } } } @@ -119,7 +122,7 @@ void 
Activation::derivativeLeakyRelu(const global::Tensor &net, global::Tensor & global::tensor_gpu::leaky_relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeLeakyRelu(net[i]); + out.cpu_data[i] *= derivativeLeakyRelu(net.cpu_data[i]); } } } @@ -129,7 +132,7 @@ void Activation::sigmoid(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::sigmoid(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = sigmoid(net[i]); + out.cpu_data[i] = sigmoid(net.cpu_data[i]); } } } @@ -139,7 +142,7 @@ void Activation::derivativeSigmoid(const global::Tensor &net, global::Tensor &ou global::tensor_gpu::sigmoid_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeSigmoid(net[i]); + out.cpu_data[i] *= derivativeSigmoid(net.cpu_data[i]); } } } @@ -149,7 +152,7 @@ void Activation::tanh(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::tanh_activation(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = tanh(net[i]); + out.cpu_data[i] = tanh(net.cpu_data[i]); } } } @@ -159,26 +162,26 @@ void Activation::derivativeTanh(const global::Tensor &net, global::Tensor &out) global::tensor_gpu::tanh_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeTanh(net[i]); + out.cpu_data[i] *= derivativeTanh(net.cpu_data[i]); } } } void Activation::softmax(const global::Tensor &net, global::Tensor &out) { if (net.isGpu) { - + global::tensor_gpu::softmax(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { global::ValueType max = maxVector(net); global::ValueType sum = 0.0; for (size_t i = 0; i < net.numElements(); ++i) { - global::ValueType x = net[i] - max; + global::ValueType x = net.cpu_data[i] - max; if (x < -700.0) x = -700.0; if (x > 700.0) x = 700.0; - out[i] = std::exp(x); - sum += out[i]; + out.cpu_data[i] = std::exp(x); + sum += out.cpu_data[i]; } sum = maxValue(sum, 1e-10); diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 6547556..2a0bab9 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,7 +1,9 @@ #include "tensor_gpu.hpp" +#include #include #include #include +#include namespace nn::global { Tensor::Tensor(const std::vector &shape, float init) { @@ -29,13 +31,19 @@ Tensor::Tensor(const std::vector &shape, float init) { } ValueType &Tensor::operator[](size_t i) { + static ValueType value; if (isGpu) { + value = tensor_gpu::getValueAt(gpu_data, i); + return value; } return cpu_data[i]; } const ValueType &Tensor::operator[](size_t i) const { + static ValueType value; if (isGpu) { + value = tensor_gpu::getValueAt(gpu_data, i); + return value; } return cpu_data[i]; } @@ -47,10 +55,15 @@ size_t Tensor::numElements() const { return cpu_data.size(); } -const std::vector &Tensor::getData() const { +void Tensor::getData(std::vector &dest) const { if (!isGpu) { - return cpu_data; + dest = cpu_data; } + + ValueType *newV = nullptr; + tensor_gpu::copyToHost(newV, gpu_data, gpu_data_size * sizeof(ValueType)); + + std::copy(newV, newV + gpu_data_size, dest.begin()); } void Tensor::fill(const ValueType &value) { @@ -95,6 +108,7 @@ void Tensor::computeStrides() { inline size_t Tensor::flattenIndex(const std::vector &indices) const { if (!isGpu) { + // CPU version, same as before if 
(indices.size() != cpu_shape.size()) { throw std::invalid_argument("Incorrect number of indices."); } @@ -105,6 +119,11 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { index += indices[i] * cpu_strides[i]; } return index; + } else { + if (indices.size() != gpu_shape_size) { + throw std::invalid_argument("Incorrect number of indices."); + } + return tensor_gpu::flattenIndexGpu(indices.data(), gpu_shape, gpu_strides, gpu_shape_size); } } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 9199305..99f81b6 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -28,7 +28,7 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count) { - cudaMemcpy(deviceDst, deviceDst, count, cudaMemcpyDeviceToDevice); + cudaMemcpy(deviceDst, deviceSrc, count, cudaMemcpyDeviceToDevice); } // Copy data from GPU to CPU. @@ -283,4 +283,99 @@ void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_ leakyReluDerivativeKernel<<>>(input, output, count, alpha); cudaDeviceSynchronize(); } + +__global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { + extern __shared__ ValueType shared[]; + + std::size_t tid = threadIdx.x; + std::size_t idx = blockIdx.x * blockDim.x + tid; + + if (idx >= count) return; + + // Load input into shared memory + shared[tid] = input[idx]; + __syncthreads(); + + // Step 1: Find max value for numerical stability + ValueType max_val = shared[0]; + for (std::size_t i = 1; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + max_val = fmaxf(max_val, shared[i]); + } + __syncthreads(); + + // Step 2: Compute exp(x - max) + ValueType e = expf(shared[tid] - max_val); + shared[tid] = e; + __syncthreads(); + + // Step 3: Sum of exponentials + ValueType sum = 0.0f; + for (std::size_t i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + sum += shared[i]; + } + __syncthreads(); + + // Step 4: Normalize + output[idx] = shared[tid] / sum; +} + +void softmax(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + std::size_t sharedMemSize = blockSize * sizeof(ValueType); + + softmaxKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +template +void setValueAt(T* devicePtr, std::size_t index, T value) { + cudaMemcpy(devicePtr + index, &value, sizeof(T), cudaMemcpyHostToDevice); +} + +template +ValueType getValueAt(const T* devicePtr , std::size_t index) { + T value; + cudaMemcpy(&value, devicePtr + index, sizeof(T), cudaMemcpyDeviceToHost); + return value; +} + +// Compute flattened index on device +__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, const size_t* strides, size_t ndim, size_t* outIndex) { + size_t idx = 0; + for (size_t i = 0; i < ndim; ++i) { + if (indices[i] >= shape[i]) { + *outIndex = size_t(-1); // invalid index + return; + } + idx += indices[i] * strides[i]; + } + *outIndex = idx; +} + +size_t flattenIndexGpu(const size_t* indices,const size_t* d_shape,const size_t* d_strides,size_t ndim) { + // Copy indices vector to device memory + size_t* d_indices = nullptr; + cudaMalloc(&d_indices, ndim * sizeof(size_t)); + cudaMemcpy(d_indices, indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); + + size_t* d_outIndex = nullptr; + cudaMalloc(&d_outIndex, sizeof(size_t)); + + // Launch kernel with a single 
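
As a host-side reference for the softmaxKernel added above, the same max-subtracted formulation used by the CPU path in activations.cpp can be written in a few lines. This is a hedged sketch intended only for checking GPU output; softmaxHost is not part of the patches and assumes a non-empty input.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax: subtract the max before exponentiating.
std::vector<float> softmaxHost(const std::vector<float> &x) {
    const float maxVal = *std::max_element(x.begin(), x.end());
    std::vector<float> out(x.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = std::exp(x[i] - maxVal);
        sum += out[i];
    }
    for (float &v : out) v /= sum;   // normalize so the outputs sum to 1
    return out;
}
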
thread since this is a scalar computation + flattenIndexKernel<<<1, 1>>>(d_indices, d_shape, d_strides, ndim, d_outIndex); + cudaDeviceSynchronize(); + + size_t hostIndex; + cudaMemcpy(&hostIndex, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + + cudaFree(d_indices); + cudaFree(d_outIndex); + + if (hostIndex == size_t(-1)) { + throw std::out_of_range("Index out of bounds."); + } + + return hostIndex; +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 3195650..a26bf29 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -54,6 +54,16 @@ void tanh_derivative(const ValueType *input, ValueType *output, std::size_t coun void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); +void softmax(const ValueType *net, ValueType *out, std::size_t size); + +template +ValueType getValueAt(const T *devicePtr, std::size_t index); + +template +void setValueAt(T *devicePtr, std::size_t index, T value); + +size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); + } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU From 7c96a3c2d7bcce83b81c503ffa1de8d273f21341 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 11:22:11 +0300 Subject: [PATCH 09/40] new commit --- include/tensor.hpp | 5 ---- src/model/tensor.cpp | 54 ++++++---------------------------------- src/model/tensor_gpu.cu | 48 +++++++++++++++++++++++++++++++++++ src/model/tensor_gpu.hpp | 1 + 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 9147fb1..23dc8b6 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -65,11 +65,6 @@ class Tensor { void fill(const ValueType &value); // Arithmetic operations - Tensor operator+(const Tensor &other) const; - Tensor operator*(const Tensor &other) const; - Tensor operator-(const Tensor &other) const; - Tensor operator/(const Tensor &other) const; - Tensor operator*(ValueType scalar) const; Tensor operator+(ValueType scalar) const; Tensor operator/(ValueType scalar) const; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 2a0bab9..10f7e54 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -128,63 +128,23 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } ValueType &Tensor::operator()(const std::vector &indices) { + static ValueType value; if (!isGpu) { return cpu_data[flattenIndex(indices)]; } + + value = tensor_gpu::getValueAtIndices(indices.data()); + return value; } ValueType Tensor::operator()(const std::vector &indices) const { + static ValueType value; if (!isGpu) { return cpu_data[flattenIndex(indices)]; } -} -Tensor Tensor::operator+(const Tensor &other) const { - if (!isGpu) { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator+."); - } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] + b[i]; - return result; - } -} - -Tensor Tensor::operator-(const Tensor &other) const { - if (!isGpu) { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator-."); - } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const 
float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] - b[i]; - return result; - } -} - -Tensor Tensor::operator/(const Tensor &other) const { - if (!isGpu) { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator/."); - } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] / b[i]; - return result; - } + value = tensor_gpu::getValueAtIndices(indices.data()); + return value; } Tensor &Tensor::operator+=(const Tensor &other) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 99f81b6..0a3fe10 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -378,4 +378,52 @@ size_t flattenIndexGpu(const size_t* indices,const size_t* d_shape,const size_t* return hostIndex; } + +__global__ void computeFlatIndexKernel( + const size_t* indices, const size_t* shape, const size_t* strides, + size_t rank, size_t* outIndex +) { + size_t flatIndex = 0; + for (size_t i = 0; i < rank; ++i) { + flatIndex += indices[i] * strides[i]; + } + *outIndex = flatIndex; +} + +ValueType getValueAtIndicesGpu( + const ValueType* deviceData, + const size_t* hostIndices, + const size_t* deviceShape, + const size_t* deviceStrides, + size_t rank +) { + // Copy host indices to device + size_t* deviceIndices; + cudaMalloc(&deviceIndices, sizeof(size_t) * rank); + cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * rank, cudaMemcpyHostToDevice); + + // Allocate output for index + size_t* deviceFlatIndex; + cudaMalloc(&deviceFlatIndex, sizeof(size_t)); + + // Launch kernel to compute flat index + computeFlatIndexKernel<<<1, 1>>>( + deviceIndices, deviceShape, deviceStrides, rank, deviceFlatIndex + ); + cudaDeviceSynchronize(); + + // Copy back flat index + size_t flatIndex; + cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + + // Get value at that index + ValueType value; + cudaMemcpy(&value, deviceData + flatIndex, sizeof(ValueType), cudaMemcpyDeviceToHost); + + // Cleanup + cudaFree(deviceIndices); + cudaFree(deviceFlatIndex); + + return value; +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index a26bf29..cf171e5 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -64,6 +64,7 @@ void setValueAt(T *devicePtr, std::size_t index, T value); size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); +ValueType getValueAtIndices(const size_t *indices); } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU From 038136f20c6bdc0a463a797c5e14ec6b27399c80 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 12:05:56 +0300 Subject: [PATCH 10/40] new commit --- include/tensor.hpp | 2 +- src/model/tensor.cpp | 66 ++++++++++++++- src/model/tensor_gpu.cu | 176 ++++++++++++++++++++++++++++++++++++--- src/model/tensor_gpu.hpp | 36 ++++++-- 4 files changed, 258 insertions(+), 22 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 23dc8b6..60bdd3e 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -28,7 +28,7 @@ class Tensor { size_t *gpu_strides = nullptr; size_t gpu_shape_size{0}; - static const bool isGpu{false}; + static const bool isGpu{true}; void computeStrides(); inline size_t flattenIndex(const 
std::vector &indices) const; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 10f7e54..f76e921 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -33,7 +33,7 @@ Tensor::Tensor(const std::vector &shape, float init) { ValueType &Tensor::operator[](size_t i) { static ValueType value; if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); + value = tensor_gpu::getValueAt(gpu_data, i); return value; } return cpu_data[i]; @@ -42,7 +42,7 @@ ValueType &Tensor::operator[](size_t i) { const ValueType &Tensor::operator[](size_t i) const { static ValueType value; if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); + value = tensor_gpu::getValueAt(gpu_data, i); return value; } return cpu_data[i]; @@ -133,7 +133,7 @@ ValueType &Tensor::operator()(const std::vector &indices) { return cpu_data[flattenIndex(indices)]; } - value = tensor_gpu::getValueAtIndices(indices.data()); + value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); return value; } @@ -143,7 +143,7 @@ ValueType Tensor::operator()(const std::vector &indices) const { return cpu_data[flattenIndex(indices)]; } - value = tensor_gpu::getValueAtIndices(indices.data()); + value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); return value; } @@ -155,6 +155,9 @@ Tensor &Tensor::operator+=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -167,6 +170,9 @@ Tensor &Tensor::operator-=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -179,6 +185,9 @@ Tensor &Tensor::operator*=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -191,6 +200,9 @@ Tensor &Tensor::operator/=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -200,6 +212,7 @@ Tensor &Tensor::operator*=(ValueType scalar) { for (auto &x : cpu_data) x *= scalar; } else { + tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -209,6 +222,7 @@ Tensor &Tensor::operator-=(ValueType scalar) { for (auto &x : cpu_data) x -= scalar; } else { + tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -218,6 +232,7 @@ Tensor &Tensor::operator+=(ValueType scalar) { for (auto &x : cpu_data) x += scalar; } else { + tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -227,6 +242,7 @@ Tensor &Tensor::operator/=(ValueType scalar) { for (auto &x : cpu_data) x /= 
scalar; } else { + tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -284,6 +300,21 @@ Tensor Tensor::matmul(const Tensor &other) const { } return result; } + + // Validate shapes similarly (assumed available via gpu_shape_size and gpu_shape pointer) + if (gpu_shape_size != 2 || other.gpu_shape_size != 1) + throw std::runtime_error("matmul (GPU): unsupported shapes."); + + size_t M = gpu_shape[0]; + size_t K = gpu_shape[1]; + if (K != other.gpu_shape[0]) + throw std::runtime_error("matmul (GPU): shape mismatch."); + + Tensor result({M}, 0.0f); + + // Call GPU kernel or helper + tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); + return result; } Tensor Tensor::outer(const Tensor &a, const Tensor &b) { @@ -307,6 +338,18 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { } return result; } + + if (a.gpu_shape_size != 1 || b.gpu_shape_size != 1) + throw std::runtime_error("outer (GPU): both tensors must be 1D vectors"); + + size_t m = a.gpu_shape[0]; + size_t n = b.gpu_shape[0]; + + Tensor result({m, n}); + + // Call GPU kernel or helper + tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); + return result; } Tensor Tensor::matmulT(const Tensor &vec) const { @@ -337,5 +380,20 @@ Tensor Tensor::matmulT(const Tensor &vec) const { } return result; } + + // GPU path + if (gpu_shape_size != 2 || vec.gpu_shape_size != 1) + throw std::runtime_error("matmulT (GPU): bad dimensions"); + + size_t M = gpu_shape[0]; + size_t N = gpu_shape[1]; + if (vec.gpu_shape[0] != M) + throw std::runtime_error("matmulT (GPU): incompatible"); + + Tensor result({N}); + + // Call GPU kernel or helper + tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); + return result; } } // namespace nn::global diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 0a3fe10..7741a0d 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -68,6 +68,39 @@ void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count cudaDeviceSynchronize(); } + +// Kernel for element-wise addition: C = A - B +__global__ void subtractionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] - B[idx]; + } +} + +// Element-wise addition: C = A + B +void subtraction(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + subtractionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise addition: C = A / B +__global__ void divisionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] / B[idx]; + } +} + +// Element-wise addition: C = A / B +void division(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + divisionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + // Kernel for element-wise multiplication: C = A * B __global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -84,6 +117,71 @@ void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t 
cudaDeviceSynchronize(); } +// Kernel for element-wise addition: C = A + B +__global__ void addKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] + B; + } +} + +// Element-wise addition: C = A + B +void add(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + addKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + + +// Kernel for element-wise addition: C = A - B +__global__ void subtractionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] - B; + } +} + +// Element-wise addition: C = A + B +void subtraction(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + subtractionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise addition: C = A / B +__global__ void divisionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] / B; + } +} + +// Element-wise addition: C = A / B +void division(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + divisionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise multiplication: C = A * B +__global__ void multiplyKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] * B; + } +} + +// Element-wise multiply: C = A * B +void multiply(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + multiplyKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + // Dot product kernel using parallel reduction (simplified version) __global__ void dotKernel(const ValueType* A, const ValueType* B, ValueType* partialSum, std::size_t count) { __shared__ ValueType cache[256]; @@ -328,15 +426,13 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { cudaDeviceSynchronize(); } -template -void setValueAt(T* devicePtr, std::size_t index, T value) { - cudaMemcpy(devicePtr + index, &value, sizeof(T), cudaMemcpyHostToDevice); +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { + cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); } -template -ValueType getValueAt(const T* devicePtr , std::size_t index) { - T value; - cudaMemcpy(&value, devicePtr + index, sizeof(T), cudaMemcpyDeviceToHost); +ValueType getValueAt(const ValueType* devicePtr , std::size_t index) { + ValueType value; + cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); return value; } @@ -390,17 +486,17 @@ __global__ void computeFlatIndexKernel( *outIndex = flatIndex; } -ValueType getValueAtIndicesGpu( +ValueType getValueAtIndices( const ValueType* deviceData, const size_t* hostIndices, const size_t* deviceShape, const size_t* deviceStrides, - size_t rank + size_t size ) { // Copy 
host indices to device size_t* deviceIndices; - cudaMalloc(&deviceIndices, sizeof(size_t) * rank); - cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * rank, cudaMemcpyHostToDevice); + cudaMalloc(&deviceIndices, sizeof(size_t) * size); + cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * size, cudaMemcpyHostToDevice); // Allocate output for index size_t* deviceFlatIndex; @@ -408,7 +504,7 @@ ValueType getValueAtIndicesGpu( // Launch kernel to compute flat index computeFlatIndexKernel<<<1, 1>>>( - deviceIndices, deviceShape, deviceStrides, rank, deviceFlatIndex + deviceIndices, deviceShape, deviceStrides, size, deviceFlatIndex ); cudaDeviceSynchronize(); @@ -426,4 +522,60 @@ ValueType getValueAtIndicesGpu( return value; } + +__global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < M) { + ValueType sum = 0; + for (size_t j = 0; j < K; ++j) { + sum += A[row * K + j] * B[j]; + } + R[row] = sum; + } +} + +__global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = m * n; + if (idx < total) { + size_t i = idx / n; + size_t j = idx % n; + result[i * n + j] = a[i] * b[j]; + } +} + +__global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { + size_t col = blockIdx.x * blockDim.x + threadIdx.x; + if (col < N) { + ValueType sum = 0; + for (size_t i = 0; i < M; ++i) { + // W is M x N, access element at (i, col) + sum += W[i * N + col] * V[i]; + } + R[col] = sum; + } +} + +// Wrapper functions to launch kernels + +void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { + const int blockSize = 256; + int gridSize = (M + blockSize - 1) / blockSize; + matmulKernel<<>>(A, B, R, M, K); + cudaDeviceSynchronize(); +} + +void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { + const int blockSize = 256; + int gridSize = (m * n + blockSize - 1) / blockSize; + outerKernel<<>>(a, b, result, m, n); + cudaDeviceSynchronize(); +} + +void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { + const int blockSize = 256; + int gridSize = (N + blockSize - 1) / blockSize; + matmulTKernel<<>>(W, V, R, M, N); + cudaDeviceSynchronize(); +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index cf171e5..e4cdc28 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -30,9 +30,27 @@ void zero(ValueType *deviceData, std::size_t count); /// Element-wise addition: C = A + B void add(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +/// Element-wise addition: C = A - B +void subtraction(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); + +/// Element-wise addition: C = A / B +void division(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); + /// Element-wise multiply: C = A * B void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +/// Element-wise addition: C = A + B +void add(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); + +/// Element-wise addition: C = A - B +void subtraction(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); + +/// Element-wise addition: C = A / B +void division(const ValueType *A, const ValueType B, ValueType *C, 
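
A hedged usage sketch of the matrix-vector helpers defined above. All buffers, sizes, and the wrapper function name are illustrative, not part of the patches; allocate/copyToDevice/copyToHost take byte counts, as elsewhere in this series.

#include "tensor_gpu.hpp"
#include <cstddef>
#include <vector>

using nn::global::ValueType;
namespace gpu = nn::global::tensor_gpu;

// y = W * x, where W is an M x K row-major matrix and x has K elements.
std::vector<ValueType> matVecOnGpu(const std::vector<ValueType> &W,
                                   const std::vector<ValueType> &x,
                                   std::size_t M, std::size_t K) {
    ValueType *d_W = (ValueType *)gpu::allocate(M * K * sizeof(ValueType));
    ValueType *d_x = (ValueType *)gpu::allocate(K * sizeof(ValueType));
    ValueType *d_y = (ValueType *)gpu::allocate(M * sizeof(ValueType));

    gpu::copyToDevice(d_W, W.data(), M * K * sizeof(ValueType));
    gpu::copyToDevice(d_x, x.data(), K * sizeof(ValueType));

    gpu::matmul(d_W, d_x, d_y, M, K);          // launches matmulKernel and synchronizes

    std::vector<ValueType> y(M);
    gpu::copyToHost(y.data(), d_y, M * sizeof(ValueType));

    gpu::deallocate(d_W);
    gpu::deallocate(d_x);
    gpu::deallocate(d_y);
    return y;
}
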
std::size_t count); + +/// Element-wise multiply: C = A * B +void multiply(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); + /// Dot product between two vectors (A · B) float dot(const ValueType *A, const ValueType *B, std::size_t count); @@ -54,17 +72,25 @@ void tanh_derivative(const ValueType *input, ValueType *output, std::size_t coun void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); +// ---------------- Softmax ---------------- void softmax(const ValueType *net, ValueType *out, std::size_t size); -template -ValueType getValueAt(const T *devicePtr, std::size_t index); +ValueType getValueAt(const ValueType *devicePtr, std::size_t index); -template -void setValueAt(T *devicePtr, std::size_t index, T value); +void setValueAt(ValueType *devicePtr, std::size_t index, ValueType value); size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); -ValueType getValueAtIndices(const size_t *indices); +ValueType getValueAtIndices( + const ValueType *deviceData, + const size_t *hostIndices, + const size_t *deviceShape, + const size_t *deviceStrides, + size_t rank); + +void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); +void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); +void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N); } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU From 27858235a39660b414c792c60630c5f2890693d9 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 13:51:26 +0300 Subject: [PATCH 11/40] new commit --- CMakeLists.txt | 7 ++++--- src/model/tensor_gpu.cu | 18 +++++++++++++++--- src/model/tensor_gpu.hpp | 2 +- tests/binary_test.cpp | 1 - tests/data/config-binary_test.json | 2 +- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a622b5b..1bb21b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.28) +set(CMAKE_CUDA_ARCHITECTURES 86) # For RTX 3060 project(NeuralNetwork LANGUAGES CXX CUDA) # Add CUDA here # ------------------------------------------------------------------ @@ -10,6 +11,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Enforce it set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +enable_language(CUDA) + # Default to Debug build type if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) @@ -63,7 +66,6 @@ set_target_properties(NeuralNetwork PROPERTIES POSITION_INDEPENDENT_CODE ON) # Enable separable compilation for CUDA files set_target_properties(NeuralNetwork PROPERTIES CUDA_SEPARABLE_COMPILATION ON - CUDA_RESOLVE_DEVICE_SYMBOLS ON ) target_include_directories(NeuralNetwork @@ -83,7 +85,6 @@ target_link_libraries(NeuralNetwork SFML::System nlohmann_json::nlohmann_json cuda - cudart ) target_compile_options(NeuralNetwork PRIVATE -Wall -Wextra -Wpedantic) @@ -121,4 +122,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) - + diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 7741a0d..74c9583 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -1,14 +1,26 @@ #include #include "tensor_gpu.hpp" #include +#include #include + namespace 
nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. -void* allocate(std::size_t count) { - void* devicePtr = nullptr; - cudaError_t err = cudaMalloc(&devicePtr, count); +void* allocate(std::size_t size) { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); if (err != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; + } + std::cout << "CUDA device count: " << count << std::endl; + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + std::cout << "GPU memory available: " << prop.totalGlobalMem / (1024 * 1024) << " MB\n"; + + void* devicePtr = nullptr; + cudaError_t err1 = cudaMalloc(&devicePtr, size); + if (err1 != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } return devicePtr; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index e4cdc28..2d0745f 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -11,7 +11,7 @@ class Tensor; // Forward declaration namespace nn::global::tensor_gpu { /// Allocate memory on GPU for a tensor. -void *allocate(std::size_t count); +void *allocate(std::size_t size); /// Free GPU memory. void deallocate(ValueType *devicePtr); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 7dbb327..3f740e2 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -1,4 +1,3 @@ -#include "tensor.hpp" #include "tests.hpp" #include #include diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index 007cb80..59d9390 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": true, + "enableVisuals": false, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, From d50e875c233edde5c9ec022bfc668e70929e511c Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 15:42:54 +0300 Subject: [PATCH 12/40] bug fix --- CMakeLists.txt | 13 +------------ src/model/tensor.cpp | 5 ++++- src/model/tensor_gpu.cu | 11 ----------- tests/binary_test.cpp | 3 ++- 4 files changed, 7 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1bb21b6..c29b4f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,13 +44,6 @@ FetchContent_Declare(nlohmann_json FetchContent_MakeAvailable(SFML nlohmann_json) -# ------------------------------------------------------------------ -# Function: Apply sanitizers (for CPU code only) -function(apply_sanitizers target) - target_compile_options(${target} PRIVATE -fsanitize=address -fno-omit-frame-pointer -g) - target_link_libraries(${target} PRIVATE -fsanitize=address) -endfunction() - # ------------------------------------------------------------------ # Main library @@ -97,8 +90,6 @@ if(BUILD_NN_TESTS) enable_testing() include(CTest) - apply_sanitizers(NeuralNetwork) - file(GLOB TEST_SOURCES CONFIGURE_DEPENDS tests/*.cpp) if(TEST_SOURCES) @@ -109,8 +100,6 @@ if(BUILD_NN_TESTS) target_link_libraries(${test_name} PRIVATE NeuralNetwork) target_include_directories(${test_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") - apply_sanitizers(${test_name}) - add_test(NAME ${test_name} COMMAND ${test_name}) endforeach() else() @@ -122,4 +111,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) - + diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index f76e921..ff9e1e7 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -21,10 +21,13 @@ Tensor::Tensor(const 
std::vector &shape, float init) { cpu_shape = shape; cpu_data.assign(totalSize, init); } else { + gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); tensor_gpu::copyToDevice(gpu_shape, shape.data(), gpu_data_size * sizeof(size_t)); - gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_data_size = totalSize; + gpu_shape_size = shape.size(); } computeStrides(); diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 74c9583..493bbf0 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -1,23 +1,12 @@ #include #include "tensor_gpu.hpp" #include -#include #include namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. void* allocate(std::size_t size) { - int count = 0; - cudaError_t err = cudaGetDeviceCount(&count); - if (err != cudaSuccess) { - std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; - } - std::cout << "CUDA device count: " << count << std::endl; - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); - std::cout << "GPU memory available: " << prop.totalGlobalMem / (1024 * 1024) << " MB\n"; - void* devicePtr = nullptr; cudaError_t err1 = cudaMalloc(&devicePtr, size); if (err1 != cudaSuccess) { diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 3f740e2..c9716ad 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -1,3 +1,4 @@ +#include "tensor.hpp" #include "tests.hpp" #include #include @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - std::vector files {"../tests/data/test1", "../tests/data/test2"}; + std::vector files{"../tests/data/test1", "../tests/data/test2"}; model.train(files); nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); From 5c37bfbb990df8ba437e32a6e2676ddcc8f55605 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 16:06:39 +0300 Subject: [PATCH 13/40] bug fixes --- include/tensor.hpp | 10 +--------- src/model/dataBase.cpp | 3 ++- src/model/tensor.cpp | 20 ++++++++++++++++++++ src/model/tensor_gpu.cu | 1 - 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 60bdd3e..f47d82b 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -11,11 +11,6 @@ class Activation; namespace nn::global { -enum class Backend { - CPU, - GPU, -}; - class Tensor { private: std::vector cpu_data; @@ -40,10 +35,7 @@ class Tensor { public: // Constructors Tensor(const std::vector &shape, float init = 0.0f); - Tensor(const Tensor &other) - : cpu_data(other.cpu_data), - cpu_shape(other.cpu_shape), - cpu_strides(other.cpu_strides) {} + Tensor(const Tensor &other); Tensor &operator=(const Tensor &other); diff --git a/src/model/dataBase.cpp b/src/model/dataBase.cpp index 2c13444..bb79bce 100644 --- a/src/model/dataBase.cpp +++ b/src/model/dataBase.cpp @@ -65,8 +65,9 @@ int DataBase::load(const std::string &db_filename) { } TrainSample new_sample = readLine(line); - if (new_sample.input.numElements() == 0) + if (new_sample.input.numElements() == 0) { continue; + } samples.add(new_sample); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ff9e1e7..1772600 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -33,6 +33,26 @@ Tensor::Tensor(const std::vector &shape, float init) { computeStrides(); } +Tensor::Tensor(const Tensor &other) { + if (isGpu) { + 
gpu_data_size = other.gpu_data_size; + gpu_shape_size = other.gpu_shape_size; + + gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); + gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); + gpu_shape = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); + + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size*sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); + + } else { + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; + } +} + ValueType &Tensor::operator[](size_t i) { static ValueType value; if (isGpu) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 493bbf0..be277df 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -3,7 +3,6 @@ #include #include - namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. void* allocate(std::size_t size) { From 882810c3403e633bc6a3a7dd87321adefc1c841b Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 18:04:03 +0300 Subject: [PATCH 14/40] new commit --- include/tensor.hpp | 9 ++-- src/model/dataBase.cpp | 2 +- src/model/model.cpp | 23 ++++++---- src/model/tensor.cpp | 68 +++++++++++----------------- src/model/tensor_gpu.cu | 47 ++++++++++++++++++- src/model/tensor_gpu.hpp | 12 ++++- src/networks/fnn/DenseLayer.cpp | 14 +++--- src/networks/fnn/FNNetwork.cpp | 1 + tests/binary_test.cpp | 5 +- tests/data/config-binary_test.json | 4 +- tests/data/database-binary_test.nndb | 2 +- tests/data/test1.nndb | 2 +- tests/data/test2.nndb | 2 +- 13 files changed, 119 insertions(+), 72 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index f47d82b..40c90fe 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -37,13 +37,12 @@ class Tensor { Tensor(const std::vector &shape, float init = 0.0f); Tensor(const Tensor &other); + ~Tensor(); + Tensor &operator=(const Tensor &other); - // Element access - ValueType &operator()(const std::vector &indices); - ValueType operator()(const std::vector &indices) const; - ValueType &operator[](size_t i); - const ValueType &operator[](size_t i) const; + ValueType getValue(const std::vector newShape) const; + void setValue(const std::vector newShape, const ValueType value); // Iterators (for range-based loops) auto begin() noexcept { return cpu_data.begin(); } diff --git a/src/model/dataBase.cpp b/src/model/dataBase.cpp index bb79bce..2a8afce 100644 --- a/src/model/dataBase.cpp +++ b/src/model/dataBase.cpp @@ -26,7 +26,7 @@ TrainSample DataBase::readLine(const std::string &line) { for (size_t i = 0; i < samples.sInputSize; ++i) { iss >> token; - new_sample.input({i}) = std::stod(token); + new_sample.input.setValue({i}, std::stod(token)); } return new_sample; diff --git a/src/model/model.cpp b/src/model/model.cpp index eb2f22e..920ad24 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,5 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" +#include "dataBase.hpp" #include #include #include @@ -134,7 +135,8 @@ void Model::addCNN(const std::uint32_t width, ISubNetworkConfig &_config) { } void Model::runModel(const global::Tensor &input) { - visual.updateInput(input); + // visual.updateInput(input); + printf("test1:\n"); network[0]->forward(input); for (size_t i = 1; i < network.size(); ++i) { @@ -177,13 +179,18 @@ 
global::ValueType Model::runBackPropagation( resetNetworkGradient(); for (size_t i = 0; i < batch.size(); ++i) { - auto current_sample_ptr = batch.samples.at(i); + TrainSample *current_sample_ptr = batch.samples.at(i); visual.updatePrediction(current_sample_ptr->pre); - runModel(transformation(current_sample_ptr->input)); + printf("test1\n"); + printf("test1: %zu\n", current_sample_ptr->input.numElements()); + // runModel(transformation(current_sample_ptr->input)); + + printf("\n"); + runModel(current_sample_ptr->input); global::Tensor output({outputSize()}); - output[current_sample_ptr->pre.index] = 1; + output.setValue({current_sample_ptr->pre.index}, 1); if (doBackward) { Backward(output); @@ -404,7 +411,7 @@ void Model::save(const std::string &file) { outFile << params.numElements() << " "; for (size_t j = 0; j < params.numElements(); ++j) { - outFile << params[j] << " "; + outFile << params.getValue({j}) << " "; } outFile << std::endl; @@ -429,7 +436,7 @@ void Model::load(const std::string &file) { for (size_t i = 0; i < ParamSize; ++i) { iss >> num; - numbers[i] = num; + numbers.setValue({i}, num); } network[networkI]->setParams(numbers); @@ -444,12 +451,12 @@ global::Prediction Model::getPrediction() const { size_t max = 0; for (size_t i = 1; i < outputSize(); ++i) { - if (getOutput()[i] > getOutput()[max]) { + if (getOutput().getValue({i}) > getOutput().getValue({max})) { max = i; } } - return global::Prediction(max, getOutput()[max]); + return global::Prediction(max, getOutput().getValue({max})); } void Model::setTraining() { diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 1772600..4d3807d 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -34,41 +34,23 @@ Tensor::Tensor(const std::vector &shape, float init) { } Tensor::Tensor(const Tensor &other) { - if (isGpu) { - gpu_data_size = other.gpu_data_size; - gpu_shape_size = other.gpu_shape_size; + if (isGpu) { + gpu_data_size = other.gpu_data_size; + gpu_shape_size = other.gpu_shape_size; gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); gpu_shape = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size*sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); - - } else { - cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; - } -} - -ValueType &Tensor::operator[](size_t i) { - static ValueType value; - if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); - return value; - } - return cpu_data[i]; -} + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); -const ValueType &Tensor::operator[](size_t i) const { - static ValueType value; - if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); - return value; + } else { + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; } - return cpu_data[i]; } size_t Tensor::numElements() const { @@ -150,24 +132,18 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } } -ValueType 
&Tensor::operator()(const std::vector &indices) { - static ValueType value; - if (!isGpu) { - return cpu_data[flattenIndex(indices)]; +ValueType Tensor::getValue(const std::vector newShape) const { + if (isGpu) { + return tensor_gpu::getValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size); } - - value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); - return value; + return cpu_data[flattenIndex(newShape)]; } -ValueType Tensor::operator()(const std::vector &indices) const { - static ValueType value; - if (!isGpu) { - return cpu_data[flattenIndex(indices)]; +void Tensor::setValue(const std::vector newShape, const ValueType value) { + if (isGpu) { + tensor_gpu::setValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size, value); } - - value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); - return value; + cpu_data[flattenIndex(newShape)] = value; } Tensor &Tensor::operator+=(const Tensor &other) { @@ -419,4 +395,12 @@ Tensor Tensor::matmulT(const Tensor &vec) const { tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); return result; } + +Tensor::~Tensor() { + if (isGpu) { + tensor_gpu::deallocate(gpu_data); + tensor_gpu::deallocate(gpu_shape); + tensor_gpu::deallocate(gpu_strides); + } +} } // namespace nn::global diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index be277df..78bb4cd 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -15,7 +15,7 @@ void* allocate(std::size_t size) { } // Free GPU memory. -void deallocate(ValueType* devicePtr) { +void deallocate(void* devicePtr) { if (devicePtr) { cudaFree(devicePtr); } @@ -523,6 +523,51 @@ ValueType getValueAtIndices( return value; } +__global__ void setValueAtIndexKernel(ValueType* data, size_t flatIndex, ValueType value) { + data[flatIndex] = value; +} + +void setValueAtIndices( + ValueType* deviceData, + const size_t* hostIndices, + const size_t* deviceShape, + const size_t* deviceStrides, + size_t ndim, + ValueType value +) { + // Step 1: Allocate and copy indices to GPU + size_t* deviceIndices; + cudaMalloc(&deviceIndices, ndim * sizeof(size_t)); + cudaMemcpy(deviceIndices, hostIndices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); + + // Step 2: Allocate memory to store computed flat index + size_t* deviceFlatIndex; + cudaMalloc(&deviceFlatIndex, sizeof(size_t)); + + // Step 3: Launch kernel to compute flat index + computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceShape, deviceStrides, ndim, deviceFlatIndex); + cudaDeviceSynchronize(); + + // Step 4: Copy flat index to host + size_t flatIndex; + cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + + // Step 5: Validate flat index + if (flatIndex == size_t(-1)) { + cudaFree(deviceIndices); + cudaFree(deviceFlatIndex); + throw std::out_of_range("Invalid indices in setValueAtIndices"); + } + + // Step 6: Launch kernel to set value at computed flat index + setValueAtIndexKernel<<<1, 1>>>(deviceData, flatIndex, value); + cudaDeviceSynchronize(); + + // Cleanup + cudaFree(deviceIndices); + cudaFree(deviceFlatIndex); +} + __global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { size_t row = blockIdx.x * blockDim.x + threadIdx.x; if (row < M) { diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 2d0745f..6142cfa 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -14,7 
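
A brief, hedged usage sketch of the getValue/setValue accessors that replace operator()/operator[] in this patch; the shape and values below are illustrative only.

#include "tensor.hpp"

void exampleAccess() {
    nn::global::Tensor t({2, 3});                      // 2x3 tensor, zero-initialized
    t.setValue({1, 2}, 5.0f);                          // write the element at row 1, column 2
    nn::global::ValueType v = t.getValue({1, 2});      // reads back 5.0f
    (void)v;
}
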
+14,7 @@ namespace nn::global::tensor_gpu { void *allocate(std::size_t size); /// Free GPU memory. -void deallocate(ValueType *devicePtr); +void deallocate(void *devicePtr); /// Copy data from CPU to GPU. void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); @@ -86,7 +86,15 @@ ValueType getValueAtIndices( const size_t *hostIndices, const size_t *deviceShape, const size_t *deviceStrides, - size_t rank); + size_t size); + +void setValueAtIndices( + ValueType *deviceData, + const size_t *hostIndices, + const size_t *deviceShape, + const size_t *deviceStrides, + size_t ndim, + ValueType value); void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index dc8f7ea..a0f9833 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -28,7 +28,7 @@ void Hidden_Layer::CreateDropoutMask() { std::bernoulli_distribution bernoulli(keepProb); for (size_t i = 0; i < dropoutMask.numElements(); ++i) { - dropoutMask[i] = static_cast(bernoulli(rng)); + dropoutMask.setValue({i}, static_cast(bernoulli(rng))); } } @@ -63,7 +63,7 @@ void Output_Layer::backward( global::ValueType Output_Layer::getCrossEntropyLoss( const global::Tensor &prediction, const size_t target) { - return -std::log(std::max(prediction[target], MIN_LOSS_VALUE)); + return -std::log(std::max(prediction.getValue({target}), MIN_LOSS_VALUE)); } global::ValueType Output_Layer::getLoss(const global::Prediction &targets) { @@ -73,7 +73,7 @@ global::ValueType Output_Layer::getLoss(const global::Prediction &targets) { void Hidden_Layer::forward(const global::Tensor &metrix) { if (isTraining) CreateDropoutMask(); - + net = parameters.weights.matmul(metrix); net += parameters.biases; @@ -129,14 +129,14 @@ const global::Tensor DenseLayer::getData() const { size_t currentI = 0; for (size_t i = 0; i < size(); ++i) { for (size_t j = 0; j < prevSize(); ++j) { - matrix[currentI] = parameters.weights({i, j}); + matrix.setValue({currentI}, parameters.weights.getValue({i, j})); ++currentI; } } for (size_t i = 0; i < size(); ++i) { - matrix[currentI] = parameters.biases[i]; + matrix.setValue({currentI}, parameters.biases.getValue({i})); ++currentI; } @@ -148,14 +148,14 @@ void DenseLayer::setData(const global::Tensor newParam) { size_t currentI = 0; for (size_t i = 0; i < size(); ++i) { for (size_t j = 0; j < prevSize(); ++j) { - parameters.weights({i, j}) = newParam[currentI]; + parameters.weights.setValue({i, j}, newParam.getValue({currentI})); ++currentI; } } for (size_t i = 0; i < size(); ++i) { - parameters.biases[i] = newParam[currentI]; + parameters.biases.setValue({i}, newParam.getValue({currentI})); ++currentI; } diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 46f163c..52bde8f 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -52,6 +52,7 @@ void FNNetwork::sendNewVNeurons(const size_t i) const { void FNNetwork::forward(const global::Tensor &newInput) { input = newInput; + printf("test1: %zu", input.numElements()); layers[0]->forward(input); sendNewVNeurons(0); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index c9716ad..5ad6b53 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -77,12 +77,15 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = 
tests::appendToBase("config-binary_test.json"); + // nn::global::Tensor give_me_a_name({5, 1}); + // nn::global::Tensor newt = give_me_a_name; + // return 0; nn::model::Model model(config_FN); if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - std::vector files{"../tests/data/test1", "../tests/data/test2"}; + std::vector files {"../tests/data/test1", "../tests/data/test2"}; model.train(files); nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index 59d9390..66b35f8 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -26,8 +26,8 @@ "network config": [ { "type": "FNN", - "input size": 100, - "output size": 100, + "input size": 10, + "output size": 16, "output activation": 4, "layers": [ { "size": 30, "activationType": 1 }, diff --git a/tests/data/database-binary_test.nndb b/tests/data/database-binary_test.nndb index 4a40143..f075152 100644 --- a/tests/data/database-binary_test.nndb +++ b/tests/data/database-binary_test.nndb @@ -1,4 +1,4 @@ -1000 100 +1000 10 0 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0.1 0 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0 0.1 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 diff --git a/tests/data/test1.nndb b/tests/data/test1.nndb index 1aba172..8ea0dc7 100644 --- a/tests/data/test1.nndb +++ b/tests/data/test1.nndb @@ -1,4 +1,4 @@ -60 100 +60 10 0 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0.1 0 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0 0.1 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 diff --git a/tests/data/test2.nndb b/tests/data/test2.nndb index 69eadc4..209c026 100644 --- a/tests/data/test2.nndb +++ b/tests/data/test2.nndb @@ -1,4 +1,4 @@ -70 100 +70 10 5 0.5 1 0.5 1 0.1 0.1 0.1 0.1 0.1 0.1 5 0.1 0.5 1 0.5 1 0.1 0.1 0.1 0.1 0.1 5 0.1 0.1 0.5 1 0.5 1 0.1 0.1 0.1 0.1 From 4c83f40225d89e4364d8b290296deedad7cd5bf7 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 18:17:11 +0300 Subject: [PATCH 15/40] more bug fixes --- src/networks/fnn/FNNetwork.cpp | 12 +++--------- src/networks/fnn/FnnVisualizer.cpp | 8 ++++---- src/visualizer/visualModel.cpp | 8 +++----- tests/binary_test.cpp | 6 +++--- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 52bde8f..94ee3ac 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -139,13 +139,7 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) { } void FNNetwork::calculateInputDelta(const global::Tensor &deltas) { - input.fill(0); - - for (size_t i = 0; i < inputSize(); ++i) { - for (size_t j = 0; j < layers[0]->size(); ++j) { - input[i] += deltas[j] * layers[0]->getParms().weights({j, i}); - } - } + input = deltas.matmulT(layers[0]->getParms().weights); } size_t FNNetwork::getParamCount() const { @@ -167,7 +161,7 @@ global::Tensor FNNetwork::getParams() const { global::Tensor params = layers[i]->getData(); for (size_t j = 0; j < params.numElements(); ++j) { - matrix[matrixI] = params[j]; + matrix.setValue({matrixI}, params.getValue({j})); ++matrixI; } } @@ -181,7 +175,7 @@ void FNNetwork::setParams(const global::Tensor params) { global::Tensor newParam({layers[i]->getParamCount()}); for (size_t k = 0; k < newParam.numElements(); ++k) { - newParam[k] = params[j]; + newParam.setValue({k}, params.getValue({j})); ++j; } diff --git a/src/networks/fnn/FnnVisualizer.cpp b/src/networks/fnn/FnnVisualizer.cpp index 5ae7306..47359a0 100644 --- 
a/src/networks/fnn/FnnVisualizer.cpp +++ b/src/networks/fnn/FnnVisualizer.cpp @@ -98,7 +98,7 @@ void VisualDenseLayer::drawWeights(const size_t neuron_i, sf::RenderTexture &tar line_[2].position = to; line_[0].color = LINE_COLOR; - line_[0].color.a = parameters.weights({neuron_i, neuronP}) * 50; + line_[0].color.a = parameters.weights.getValue({neuron_i, neuronP}) * 50; line_[1].color = line_[0].color; line_[2].color = getColorFromTextT(getTextT(neuron_i, neuronP)); target.draw(line_); @@ -162,7 +162,7 @@ void VisualDenseLayer::renderNeuron(const size_t index, sf::RenderTexture &targe drawWeights(index, target); } - drawNeuron(cacheNeurons[index], net[index], out[index], target); + drawNeuron(cacheNeurons[index], net.getValue({index}), out.getValue({index}), target); } void VisualDenseLayer::drawNeurons(sf::RenderTexture &target) { @@ -198,10 +198,10 @@ float VisualDenseLayer::calculateGap(const int size, const float scale) { } textType VisualDenseLayer::getTextT(const size_t layer_i, const size_t layer_p) { - if (gradients.weights({layer_i, layer_p}) < 0) + if (gradients.weights.getValue({layer_i, layer_p}) < 0) return textType::DOWN; - if (gradients.weights({layer_i, layer_p}) > 0) + if (gradients.weights.getValue({layer_i, layer_p}) > 0) return textType::UP; return textType::NORMAL; diff --git a/src/visualizer/visualModel.cpp b/src/visualizer/visualModel.cpp index 6391393..4be7484 100644 --- a/src/visualizer/visualModel.cpp +++ b/src/visualizer/visualModel.cpp @@ -4,8 +4,6 @@ #include "fonts.hpp" #include "network/IvisualNetwork.hpp" #include "panel.hpp" -#include -#include #include #include @@ -77,14 +75,14 @@ sf::Color DummyLayer::getNeuronColor(const global::ValueType value) { void DummyLayer::renderNeuron(sf::RenderTexture &target, const size_t index) { sf::RectangleShape shape(cacheNeurons[index].size); - shape.setFillColor(getNeuronColor(values({index}))); + shape.setFillColor(getNeuronColor(values.getValue({index}))); shape.setPosition(cacheNeurons[index].position + pos); target.draw(shape); if (10 * cacheNeurons[index].size.y / global::NEURON_WIDTH > global::MIN_FONT_SIZE) { std::ostringstream ss; - ss << std::fixed << std::setprecision(4) << values({index}); + ss << std::fixed << std::setprecision(4) << values.getValue({index}); sf::Text text(Fonts::getFont()); text.setCharacterSize(10 * cacheNeurons[index].size.y / global::NEURON_WIDTH); @@ -160,7 +158,7 @@ void ModelPanel::renderSubNetwork(const size_t index) { void ModelPanel::setPrediction(const global::Prediction &pre) { global::Tensor output({predictionLayer.size()}); - output({pre.index}) = 1; + output.setValue({pre.index}, 1); predictionLayer.setValues(output); setUpdate(); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 5ad6b53..6bbc2f6 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -129,9 +129,9 @@ int main(int argc, char *argv[]) { } for (size_t i = 4 + num2; i > num2; i--) { - input({i - 1}) = bit_by_index(num1, 4 - i + num2); - if (input({i - 1}) == 0) { - input({i - 1}) = 0.5; + input.setValue({i - 1}, bit_by_index(num1, 4 - i + num2)); + if (input.getValue({i - 1}) == 0) { + input.setValue({i - 1}, 0.5); } } From 8062331aa9612c6dbae7df71fc38ecb4e8a4b6a2 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 19:01:16 +0300 Subject: [PATCH 16/40] bug fix --- src/model/model.cpp | 10 ++-------- src/networks/fnn/FNNetwork.cpp | 3 +-- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/model/model.cpp b/src/model/model.cpp index 920ad24..9e6442c 100644 --- 
a/src/model/model.cpp +++ b/src/model/model.cpp @@ -135,8 +135,7 @@ void Model::addCNN(const std::uint32_t width, ISubNetworkConfig &_config) { } void Model::runModel(const global::Tensor &input) { - // visual.updateInput(input); - printf("test1:\n"); + visual.updateInput(input); network[0]->forward(input); for (size_t i = 1; i < network.size(); ++i) { @@ -182,12 +181,7 @@ global::ValueType Model::runBackPropagation( TrainSample *current_sample_ptr = batch.samples.at(i); visual.updatePrediction(current_sample_ptr->pre); - printf("test1\n"); - printf("test1: %zu\n", current_sample_ptr->input.numElements()); - // runModel(transformation(current_sample_ptr->input)); - - printf("\n"); - runModel(current_sample_ptr->input); + runModel(transformation(current_sample_ptr->input)); global::Tensor output({outputSize()}); output.setValue({current_sample_ptr->pre.index}, 1); diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 94ee3ac..fd6f7fd 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -52,7 +52,6 @@ void FNNetwork::sendNewVNeurons(const size_t i) const { void FNNetwork::forward(const global::Tensor &newInput) { input = newInput; - printf("test1: %zu", input.numElements()); layers[0]->forward(input); sendNewVNeurons(0); @@ -139,7 +138,7 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) { } void FNNetwork::calculateInputDelta(const global::Tensor &deltas) { - input = deltas.matmulT(layers[0]->getParms().weights); + input = layers[0]->getParms().weights.matmulT(deltas); } size_t FNNetwork::getParamCount() const { From 413770bf2ff8a0242c3e24ca3ee3e5d30ce17e96 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 21:06:55 +0300 Subject: [PATCH 17/40] new commit --- include/tensor.hpp | 4 ++-- src/model/tensor.cpp | 22 ++++++++++--------- src/model/tensor_gpu.cu | 46 +++++++++++++++++++--------------------- src/model/tensor_gpu.hpp | 2 -- tests/binary_test.cpp | 6 ++++-- 5 files changed, 40 insertions(+), 40 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 40c90fe..c1fa8fa 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -41,8 +41,8 @@ class Tensor { Tensor &operator=(const Tensor &other); - ValueType getValue(const std::vector newShape) const; - void setValue(const std::vector newShape, const ValueType value); + ValueType getValue(const std::vector &newShape) const; + void setValue(const std::vector &newShape, const ValueType value); // Iterators (for range-based loops) auto begin() noexcept { return cpu_data.begin(); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 4d3807d..ea9a223 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -24,7 +24,7 @@ Tensor::Tensor(const std::vector &shape, float init) { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); - tensor_gpu::copyToDevice(gpu_shape, shape.data(), gpu_data_size * sizeof(size_t)); + tensor_gpu::copyToDevice(gpu_shape, shape.data(), shape.size() * sizeof(size_t)); gpu_data_size = totalSize; gpu_shape_size = shape.size(); @@ -99,7 +99,7 @@ Tensor &Tensor::operator=(const Tensor &other) { void Tensor::computeStrides() { if (isGpu) { gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size * sizeof(size_t)); + tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size); } else { const size_t dim = 
cpu_shape.size(); cpu_strides.resize(dim); @@ -132,18 +132,20 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } } -ValueType Tensor::getValue(const std::vector newShape) const { - if (isGpu) { - return tensor_gpu::getValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size); +ValueType Tensor::getValue(const std::vector &indices) const { + if (!isGpu) { + return cpu_data[flattenIndex(indices)]; } - return cpu_data[flattenIndex(newShape)]; + + return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } -void Tensor::setValue(const std::vector newShape, const ValueType value) { - if (isGpu) { - tensor_gpu::setValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size, value); +void Tensor::setValue(const std::vector &indices, const ValueType value) { + if (!isGpu) { + cpu_data[flattenIndex(indices)] = value; + } else { + tensor_gpu::setValueAt(gpu_data, flattenIndex(indices), value); } - cpu_data[flattenIndex(newShape)] = value; } Tensor &Tensor::operator+=(const Tensor &other) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 78bb4cd..b27ffd4 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -27,13 +27,13 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { } -void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count) { - cudaMemcpy(deviceDst, deviceSrc, count, cudaMemcpyDeviceToDevice); +void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t size) { + cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); } // Copy data from GPU to CPU. -void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count) { - cudaMemcpy(hostDst, deviceSrc, count, cudaMemcpyDeviceToHost); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { + cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost); } // Kernel to set all elements to zero. 
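// [Editor's note — illustrative sketch, not part of the patch series] The
// single-element helpers getValueAt/setValueAt that back Tensor::getValue and
// Tensor::setValue on the GPU path are assumed to reduce to one cudaMemcpy per
// access, roughly as follows (standalone snippet; ValueType mirrors
// nn::global::ValueType, and the *Sketch names are placeholders, not the
// repository's actual definitions):

#include <cuda_runtime.h>
#include <cstddef>

using ValueType = float;

// Read one element from device memory (synchronous device-to-host copy).
ValueType getValueAtSketch(const ValueType *devicePtr, std::size_t index) {
    ValueType value{};
    cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost);
    return value;
}

// Write one element to device memory (synchronous host-to-device copy).
void setValueAtSketch(ValueType *devicePtr, std::size_t index, ValueType value) {
    cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice);
}

// Every such call is a full host<->device round trip, which is why the later
// patches in this series compute the flat index on the host and keep
// per-element GPU access to a minimum.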
@@ -436,12 +436,13 @@ ValueType getValueAt(const ValueType* devicePtr , std::size_t index) { return value; } -// Compute flattened index on device -__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, const size_t* strides, size_t ndim, size_t* outIndex) { +// Kernel to compute flattened index +__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, + const size_t* strides, size_t ndim, size_t* outIndex) { size_t idx = 0; for (size_t i = 0; i < ndim; ++i) { if (indices[i] >= shape[i]) { - *outIndex = size_t(-1); // invalid index + *outIndex = size_t(-1); return; } idx += indices[i] * strides[i]; @@ -449,34 +450,33 @@ __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, c *outIndex = idx; } -size_t flattenIndexGpu(const size_t* indices,const size_t* d_shape,const size_t* d_strides,size_t ndim) { - // Copy indices vector to device memory - size_t* d_indices = nullptr; +// Host function to launch kernel +size_t flattenIndexGpu(const size_t* h_indices, const size_t* d_shape, + const size_t* d_strides, size_t ndim) { + size_t *d_indices, *d_outIndex; cudaMalloc(&d_indices, ndim * sizeof(size_t)); - cudaMemcpy(d_indices, indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); - - size_t* d_outIndex = nullptr; cudaMalloc(&d_outIndex, sizeof(size_t)); - // Launch kernel with a single thread since this is a scalar computation + cudaMemcpy(d_indices, h_indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); + flattenIndexKernel<<<1, 1>>>(d_indices, d_shape, d_strides, ndim, d_outIndex); cudaDeviceSynchronize(); - size_t hostIndex; - cudaMemcpy(&hostIndex, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + size_t result; + cudaMemcpy(&result, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); cudaFree(d_indices); cudaFree(d_outIndex); - if (hostIndex == size_t(-1)) { - throw std::out_of_range("Index out of bounds."); + if (result == size_t(-1)) { + throw std::out_of_range("Flattened index out of bounds."); } - return hostIndex; + return result; } __global__ void computeFlatIndexKernel( - const size_t* indices, const size_t* shape, const size_t* strides, + const size_t* indices, const size_t* strides, size_t rank, size_t* outIndex ) { size_t flatIndex = 0; @@ -489,7 +489,6 @@ __global__ void computeFlatIndexKernel( ValueType getValueAtIndices( const ValueType* deviceData, const size_t* hostIndices, - const size_t* deviceShape, const size_t* deviceStrides, size_t size ) { @@ -504,7 +503,7 @@ ValueType getValueAtIndices( // Launch kernel to compute flat index computeFlatIndexKernel<<<1, 1>>>( - deviceIndices, deviceShape, deviceStrides, size, deviceFlatIndex + deviceIndices, deviceStrides, size, deviceFlatIndex ); cudaDeviceSynchronize(); @@ -530,7 +529,6 @@ __global__ void setValueAtIndexKernel(ValueType* data, size_t flatIndex, ValueTy void setValueAtIndices( ValueType* deviceData, const size_t* hostIndices, - const size_t* deviceShape, const size_t* deviceStrides, size_t ndim, ValueType value @@ -545,7 +543,7 @@ void setValueAtIndices( cudaMalloc(&deviceFlatIndex, sizeof(size_t)); // Step 3: Launch kernel to compute flat index - computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceShape, deviceStrides, ndim, deviceFlatIndex); + computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceStrides, ndim, deviceFlatIndex); cudaDeviceSynchronize(); // Step 4: Copy flat index to host diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 6142cfa..ae48a4a 100644 --- a/src/model/tensor_gpu.hpp +++ 
b/src/model/tensor_gpu.hpp @@ -84,14 +84,12 @@ size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_ ValueType getValueAtIndices( const ValueType *deviceData, const size_t *hostIndices, - const size_t *deviceShape, const size_t *deviceStrides, size_t size); void setValueAtIndices( ValueType *deviceData, const size_t *hostIndices, - const size_t *deviceShape, const size_t *deviceStrides, size_t ndim, ValueType value); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 6bbc2f6..aff4107 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -77,9 +77,11 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = tests::appendToBase("config-binary_test.json"); - // nn::global::Tensor give_me_a_name({5, 1}); - // nn::global::Tensor newt = give_me_a_name; + // nn::global::Tensor give_me_a_name({5, 3}); + // printf("test: \n"); + // give_me_a_name.setValue({2, 1}, 5); // return 0; + nn::model::Model model(config_FN); if (argc > 1 && std::string(argv[1]) == "l") { From efe75cbb62be12784c0975c2f4447585d4ca8090 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 21:49:46 +0300 Subject: [PATCH 18/40] bug fixes, i simplify the data structure --- include/tensor.hpp | 8 +- src/model/tensor.cpp | 198 +++++++++-------------------- src/model/tensor_gpu.hpp | 1 + src/networks/fnn/DenseLayer.cpp | 1 + src/networks/fnn/FNNetwork.cpp | 2 +- tests/data/config-binary_test.json | 4 +- 6 files changed, 68 insertions(+), 146 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index c1fa8fa..68999da 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -2,7 +2,6 @@ #define TENSOR #include "../src/model/tensor_gpu.hpp" -#include #include namespace nn::model { @@ -14,14 +13,11 @@ namespace nn::global { class Tensor { private: std::vector cpu_data; - std::vector cpu_shape; - std::vector cpu_strides; + std::vector shape; + std::vector strides; ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; - size_t *gpu_shape = nullptr; - size_t *gpu_strides = nullptr; - size_t gpu_shape_size{0}; static const bool isGpu{true}; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ea9a223..c131cf7 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -6,50 +6,38 @@ #include namespace nn::global { -Tensor::Tensor(const std::vector &shape, float init) { - if (shape.empty()) { +Tensor::Tensor(const std::vector &shape_, float init) { + if (shape_.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } size_t totalSize = std::accumulate( - shape.begin(), - shape.end(), + shape_.begin(), + shape_.end(), size_t(1), std::multiplies<>()); + shape = shape_; if (!isGpu) { - cpu_shape = shape; cpu_data.assign(totalSize, init); } else { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); - gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); - tensor_gpu::copyToDevice(gpu_shape, shape.data(), shape.size() * sizeof(size_t)); - gpu_data_size = totalSize; - gpu_shape_size = shape.size(); } computeStrides(); } Tensor::Tensor(const Tensor &other) { + shape = other.shape; + strides = other.strides; if (isGpu) { gpu_data_size = other.gpu_data_size; - gpu_shape_size = other.gpu_shape_size; - gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); - gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - gpu_shape = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - 
tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); - } else { cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; } } @@ -79,57 +67,40 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; + shape = other.shape; + strides = other.strides; if (!isGpu) { cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; } else { - gpu_shape = (size_t *)tensor_gpu::allocate(other.gpu_shape_size * sizeof(size_t)); gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; - gpu_shape_size = other.gpu_shape_size; - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_strides, other.gpu_strides, gpu_shape_size * sizeof(size_t)); } return *this; } void Tensor::computeStrides() { - if (isGpu) { - gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size); - } else { - const size_t dim = cpu_shape.size(); - cpu_strides.resize(dim); - size_t stride = 1; - for (size_t i = dim; i-- > 0;) { - cpu_strides[i] = stride; - stride *= cpu_shape[i]; - } + const size_t dim = shape.size(); + strides.resize(dim); + size_t stride = 1; + for (size_t i = dim; i-- > 0;) { + strides[i] = stride; + stride *= shape[i]; } } inline size_t Tensor::flattenIndex(const std::vector &indices) const { - if (!isGpu) { - // CPU version, same as before - if (indices.size() != cpu_shape.size()) { - throw std::invalid_argument("Incorrect number of indices."); - } - size_t index = 0; - for (size_t i = 0; i < cpu_shape.size(); ++i) { - if (indices[i] >= cpu_shape[i]) - throw std::out_of_range("Index out of bounds."); - index += indices[i] * cpu_strides[i]; - } - return index; - } else { - if (indices.size() != gpu_shape_size) { - throw std::invalid_argument("Incorrect number of indices."); - } - return tensor_gpu::flattenIndexGpu(indices.data(), gpu_shape, gpu_strides, gpu_shape_size); + // CPU version, same as before + if (indices.size() != shape.size()) { + throw std::invalid_argument("Incorrect number of indices."); } + size_t index = 0; + for (size_t i = 0; i < shape.size(); ++i) { + if (indices[i] >= shape[i]) + throw std::out_of_range("Index out of bounds."); + index += indices[i] * strides[i]; + } + return index; } ValueType Tensor::getValue(const std::vector &indices) const { @@ -149,60 +120,52 @@ void Tensor::setValue(const std::vector &indices, const ValueType value) } Tensor &Tensor::operator+=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } Tensor 
&Tensor::operator-=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } Tensor &Tensor::operator*=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } Tensor &Tensor::operator/=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; @@ -273,20 +236,18 @@ Tensor Tensor::operator+(ValueType scalar) const { } Tensor Tensor::matmul(const Tensor &other) const { - if (!isGpu) { - const auto &aShape = cpu_shape; - const auto &bShape = other.cpu_shape; + const auto &aShape = shape; + const auto &bShape = other.shape; + if (aShape.size() != 2 || bShape.size() != 1) + throw std::runtime_error("matmul: unsupported shapes."); - if (aShape.size() != 2 || bShape.size() != 1) - throw std::runtime_error("matmul: unsupported shapes."); - - size_t M = aShape[0]; - size_t K = aShape[1]; - if (K != bShape[0]) - throw std::runtime_error("matmul: shape mismatch."); - - Tensor result({M}); + size_t M = aShape[0]; + size_t K = aShape[1]; + if (K != bShape[0]) + throw std::runtime_error("matmul: shape mismatch."); + Tensor result({M}); + if (!isGpu) { const float *A = cpu_data.data(); const float *B = other.cpu_data.data(); float *R = result.cpu_data.data(); @@ -301,33 +262,21 @@ Tensor Tensor::matmul(const Tensor &other) const { } return result; } - - // Validate shapes similarly (assumed available via gpu_shape_size and gpu_shape pointer) - if (gpu_shape_size != 2 || other.gpu_shape_size != 1) - throw std::runtime_error("matmul (GPU): unsupported shapes."); - - size_t M = gpu_shape[0]; - size_t K = gpu_shape[1]; - if (K != other.gpu_shape[0]) - throw std::runtime_error("matmul (GPU): shape mismatch."); - - Tensor result({M}, 0.0f); - - // Call GPU kernel or helper tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); return result; } Tensor Tensor::outer(const Tensor &a, const Tensor &b) { - if (!isGpu) { - if (a.cpu_shape.size() != 1 || b.cpu_shape.size() != 1) { - throw std::runtime_error("outer: both tensors must be 1D vectors"); - } + if (a.shape.size() != 1 || 
b.shape.size() != 1) { + throw std::runtime_error("outer: both tensors must be 1D vectors"); + } - size_t m = a.cpu_shape[0]; - size_t n = b.cpu_shape[0]; + size_t m = a.shape[0]; + size_t n = b.shape[0]; - Tensor result({m, n}); + Tensor result({m, n}); + + if (!isGpu) { float *r = result.cpu_data.data(); const float *A = a.cpu_data.data(); const float *B = b.cpu_data.data(); @@ -339,35 +288,25 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { } return result; } - - if (a.gpu_shape_size != 1 || b.gpu_shape_size != 1) - throw std::runtime_error("outer (GPU): both tensors must be 1D vectors"); - - size_t m = a.gpu_shape[0]; - size_t n = b.gpu_shape[0]; - - Tensor result({m, n}); - - // Call GPU kernel or helper tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); return result; } Tensor Tensor::matmulT(const Tensor &vec) const { - if (!isGpu) { - const auto &wShape = cpu_shape; - const auto &vShape = vec.cpu_shape; + const auto &wShape = shape; + const auto &vShape = vec.shape; - if (wShape.size() != 2 || vShape.size() != 1) - throw std::runtime_error("matmulT: bad dimensions"); + if (wShape.size() != 2 || vShape.size() != 1) + throw std::runtime_error("matmulT: bad dimensions"); - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) - throw std::runtime_error("matmulT: incompatible"); + size_t M = wShape[0]; + size_t N = wShape[1]; + if (vShape[0] != M) + throw std::runtime_error("matmulT: incompatible"); - Tensor result({N}, 0.0f); + Tensor result({N}, 0.0f); + if (!isGpu) { const float *W = cpu_data.data(); const float *V = vec.cpu_data.data(); float *R = result.cpu_data.data(); @@ -381,19 +320,6 @@ Tensor Tensor::matmulT(const Tensor &vec) const { } return result; } - - // GPU path - if (gpu_shape_size != 2 || vec.gpu_shape_size != 1) - throw std::runtime_error("matmulT (GPU): bad dimensions"); - - size_t M = gpu_shape[0]; - size_t N = gpu_shape[1]; - if (vec.gpu_shape[0] != M) - throw std::runtime_error("matmulT (GPU): incompatible"); - - Tensor result({N}); - - // Call GPU kernel or helper tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); return result; } @@ -401,8 +327,6 @@ Tensor Tensor::matmulT(const Tensor &vec) const { Tensor::~Tensor() { if (isGpu) { tensor_gpu::deallocate(gpu_data); - tensor_gpu::deallocate(gpu_shape); - tensor_gpu::deallocate(gpu_strides); } } } // namespace nn::global diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index ae48a4a..1b5a97f 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -2,6 +2,7 @@ #define TENSOR_GPU #include + namespace nn::global { using ValueType = float; } diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index a0f9833..31c5b2a 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -33,6 +33,7 @@ void Hidden_Layer::CreateDropoutMask() { } void Output_Layer::forward(const global::Tensor &metrix) { + net = parameters.weights.matmul(metrix); net += parameters.biases; diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index fd6f7fd..4304b9a 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -52,7 +52,7 @@ void FNNetwork::sendNewVNeurons(const size_t i) const { void FNNetwork::forward(const global::Tensor &newInput) { input = newInput; - layers[0]->forward(input); + layers[0]->forward(newInput); sendNewVNeurons(0); for (size_t i = 1; i < layers.size(); ++i) { diff --git a/tests/data/config-binary_test.json 
b/tests/data/config-binary_test.json index 66b35f8..783afa4 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,9 +1,9 @@ { "visual config": { - "enableVisuals": false, + "enableVisuals": true, "modes": [ { "state": "pause", "mode": true }, - { "state": "precise mode", "mode": false }, + { "state": "precise mode", "mode": true }, { "state": "auto pause", "mode": false } ] }, From ace20ca37a0ad33451cd1bb386771a80d710675b Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 17:53:56 +0300 Subject: [PATCH 19/40] bug fixes --- src/model/tensor.cpp | 1 + tests/data/config-binary_test.json | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index c131cf7..f247ecd 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -72,6 +72,7 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { + tensor_gpu::deallocate(gpu_data); gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index 783afa4..e6982c6 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,14 +1,14 @@ { "visual config": { - "enableVisuals": true, + "enableVisuals": false, "modes": [ { "state": "pause", "mode": true }, - { "state": "precise mode", "mode": true }, + { "state": "precise mode", "mode": false }, { "state": "auto pause", "mode": false } ] }, "training config": { - "batch size": 16, + "batch size": 32, "batch count": 1000, "auto save": { "saveEvery": 2000, @@ -30,7 +30,7 @@ "output size": 16, "output activation": 4, "layers": [ - { "size": 30, "activationType": 1 }, + { "size": 100, "activationType": 1 }, { "size": 30, "activationType": 1 } ] } From 26c607aa8a735cc3456a3b371a0654d702539385 Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 18:12:01 +0300 Subject: [PATCH 20/40] bug fix --- include/tensor.hpp | 6 ------ src/model/activations.cpp | 6 +++--- src/model/model.cpp | 12 +++++++++--- src/model/tensor.cpp | 11 +++++++++-- src/networks/fnn/DenseLayer.cpp | 8 +++++--- tests/binary_test.cpp | 4 ++-- 6 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 68999da..2699c5f 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -40,12 +40,6 @@ class Tensor { ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); - // Iterators (for range-based loops) - auto begin() noexcept { return cpu_data.begin(); } - auto end() noexcept { return cpu_data.end(); } - auto begin() const noexcept { return cpu_data.begin(); } - auto end() const noexcept { return cpu_data.end(); } - // Shape and size size_t numElements() const; void getData(std::vector &dest) const; diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 319104b..9f537c3 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -48,9 +48,9 @@ global::ValueType Activation::maxVector(const global::Tensor &metrix) { if (metrix.isGpu) { } global::ValueType max = metrix.cpu_data[0]; - for (auto &value : metrix) { - if (value > max) { - max = value; + for (size_t i = 0; i < metrix.numElements(); ++i) { + if (metrix.getValue({i}) > max) { + max = metrix.getValue({i}); 
} } diff --git a/src/model/model.cpp b/src/model/model.cpp index 9e6442c..525e4df 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -350,9 +350,15 @@ modelResult Model::evaluateModel( runModel(transformation(sample.input)); - size_t predicted_index = std::distance( - getOutput().begin(), - std::max_element(getOutput().begin(), getOutput().end())); + size_t predicted_index = 0; + float max_value = getOutput().getValue({0}); + + for (size_t j = 1; j < getOutput().numElements(); ++j) { + if (getOutput().getValue({j}) > max_value) { + max_value = getOutput().getValue({j}); + predicted_index = j; + } + } if (showProgressbar) { bar++; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index f247ecd..ca5021b 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -60,7 +60,14 @@ void Tensor::getData(std::vector &dest) const { } void Tensor::fill(const ValueType &value) { - std::fill(begin(), end(), value); + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + tensor_gpu::add(gpu_data, value, gpu_data, gpu_data_size); + } else { + for (auto &n : cpu_data) { + n = value; + } + } } Tensor &Tensor::operator=(const Tensor &other) { @@ -72,7 +79,7 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { - tensor_gpu::deallocate(gpu_data); + tensor_gpu::deallocate(gpu_data); gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index 31c5b2a..51a92a6 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -149,7 +149,7 @@ void DenseLayer::setData(const global::Tensor newParam) { size_t currentI = 0; for (size_t i = 0; i < size(); ++i) { for (size_t j = 0; j < prevSize(); ++j) { - parameters.weights.setValue({i, j}, newParam.getValue({currentI})); + parameters.weights.setValue({i, j}, newParam.getValue({currentI})); ++currentI; } @@ -168,8 +168,10 @@ void DenseLayer::fillParamRandom() { global::ValueType std_dev = std::sqrt(2.0 / static_cast(prevSize())); std::normal_distribution<> dist(0.0, std_dev); - for (auto &value : parameters.weights) { - value = dist(gen); + for (size_t i = 0; i < parameters.size(); ++i) { + for (size_t j = 0; j < parameters.prevSize(); ++j) { + parameters.weights.setValue({i, j}, dist(gen)); + } } } diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index aff4107..7bdd693 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -66,8 +66,8 @@ void print_database(int actual_size, int input_size, int database_size) { } void printVector(const nn::global::Tensor &vec) { - for (const auto &elem : vec) { - std::cout << elem << ' '; + for (size_t i =0; i < vec.numElements(); ++i) { + std::cout << vec.getValue({i}) << ' '; } std::cout << '\n'; From a157c5e1767c121c4d8a88ce2e96f4091c012cab Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 18:39:42 +0300 Subject: [PATCH 21/40] new commit --- src/model/tensor.cpp | 1 - src/model/tensor_gpu.cu | 73 +--------------------------------------- src/model/tensor_gpu.hpp | 3 -- 3 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ca5021b..ef6372a 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -22,7 +22,6 @@ Tensor::Tensor(const std::vector &shape_, float init) { cpu_data.assign(totalSize, init); } else 
{ gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); - gpu_data_size = totalSize; } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index b27ffd4..6b7cbb6 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -49,7 +49,6 @@ void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A + B @@ -65,7 +64,6 @@ void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } @@ -82,7 +80,6 @@ void subtraction(const ValueType* A, const ValueType* B, ValueType* C, std::size std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A / B @@ -98,7 +95,6 @@ void division(const ValueType* A, const ValueType* B, ValueType* C, std::size_t std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise multiplication: C = A * B @@ -114,7 +110,6 @@ void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A + B @@ -130,7 +125,6 @@ void add(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } @@ -147,7 +141,6 @@ void subtraction(const ValueType* A, const ValueType B, ValueType* C, std::size_ std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A / B @@ -163,7 +156,6 @@ void division(const ValueType* A, const ValueType B, ValueType* C, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise multiplication: C = A * B @@ -179,60 +171,6 @@ void multiply(const ValueType* A, const ValueType B, ValueType* C, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); -} - -// Dot product kernel using parallel reduction (simplified version) -__global__ void dotKernel(const ValueType* A, const ValueType* B, ValueType* partialSum, std::size_t count) { - __shared__ ValueType cache[256]; - std::size_t tid = threadIdx.x; - std::size_t idx = blockIdx.x * blockDim.x + tid; - - float temp = 0.0f; - if (idx < count) { - temp = A[idx] * B[idx]; - } - cache[tid] = temp; - __syncthreads(); - - // Reduction in shared memory - for (std::size_t stride = blockDim.x / 2; stride > 0; stride /= 2) { - if (tid < stride) { - cache[tid] += cache[tid + stride]; - } - __syncthreads(); - } - - if (tid == 0) { - partialSum[blockIdx.x] = cache[0]; - } -} - -// Dot product between two vectors (A · B) -float dot(const 
ValueType* A, const ValueType* B, std::size_t count) { - const std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - - // Allocate partial sums - ValueType* d_partialSum = nullptr; - cudaMalloc(&d_partialSum, numBlocks * sizeof(ValueType)); - - dotKernel<<>>(A, B, d_partialSum, count); - cudaDeviceSynchronize(); - - // Copy partial sums to host - ValueType* h_partialSum = new ValueType[numBlocks]; - cudaMemcpy(h_partialSum, d_partialSum, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); - - // Final reduction on CPU - ValueType totalSum = 0.0f; - for (std::size_t i = 0; i < numBlocks; i++) { - totalSum += h_partialSum[i]; - } - - delete[] h_partialSum; - cudaFree(d_partialSum); - return totalSum; } __global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim) { @@ -245,7 +183,7 @@ __global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim) { computeStrides<<<1, 1>>>(gpu_shape, gpu_strides, ndim); - cudaDeviceSynchronize(); // Ensure computation completes + cudaDeviceSynchronize(); } // Kernel to apply ReLU activation: max(0, x) @@ -261,7 +199,6 @@ void relu(const ValueType *input, ValueType *output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel to apply ReLU derivative: @@ -278,7 +215,6 @@ void relu_derivative(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel to apply Sigmoid activation: 1 / (1 + exp(-x)) @@ -295,7 +231,6 @@ void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel for Sigmoid derivative: s(x) * (1 - s(x)) @@ -313,7 +248,6 @@ void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidDerivativeKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel to apply Tanh activation: tanh(x) @@ -329,7 +263,6 @@ void tanh_activation(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel for Tanh derivative: 1 - tanh(x)^2 @@ -346,7 +279,6 @@ void tanh_derivative(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhDerivativeKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel for Leaky ReLU: x > 0 ? x : alpha * x @@ -363,7 +295,6 @@ void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, Va std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluKernel<<>>(input, output, count, alpha); - cudaDeviceSynchronize(); } // Kernel for Leaky ReLU derivative: x > 0 ? 
1 : alpha @@ -379,7 +310,6 @@ void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_ std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluDerivativeKernel<<>>(input, output, count, alpha); - cudaDeviceSynchronize(); } __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { @@ -423,7 +353,6 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { std::size_t sharedMemSize = blockSize * sizeof(ValueType); softmaxKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 1b5a97f..4e48651 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -52,9 +52,6 @@ void division(const ValueType *A, const ValueType B, ValueType *C, std::size_t c /// Element-wise multiply: C = A * B void multiply(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); -/// Dot product between two vectors (A · B) -float dot(const ValueType *A, const ValueType *B, std::size_t count); - void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); // ---------------- ReLU ---------------- From 696092acd5c48af3e25749004053d350ef17ffaa Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 18:55:29 +0300 Subject: [PATCH 22/40] performance improvment --- include/tensor.hpp | 6 ------ src/model/optimizers.cpp | 5 +++-- src/model/optimizers.hpp | 4 ++-- src/model/tensor.cpp | 45 ++++++++++------------------------------ 4 files changed, 16 insertions(+), 44 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 2699c5f..3da043c 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -45,12 +45,6 @@ class Tensor { void getData(std::vector &dest) const; void fill(const ValueType &value); - // Arithmetic operations - Tensor operator*(ValueType scalar) const; - Tensor operator+(ValueType scalar) const; - Tensor operator/(ValueType scalar) const; - Tensor operator-(ValueType scalar) const; - Tensor &operator+=(const Tensor &other); Tensor &operator-=(const Tensor &other); Tensor &operator*=(const Tensor &other); diff --git a/src/model/optimizers.cpp b/src/model/optimizers.cpp index 9014aa2..b34a89a 100644 --- a/src/model/optimizers.cpp +++ b/src/model/optimizers.cpp @@ -1,7 +1,8 @@ #include "optimizers.hpp" namespace nn::model { -void ConstantOptimizer::step(global::Tensor &weight, const global::Tensor &grad) { - weight -= grad * (config.getLearningRate() / batchSize); +void ConstantOptimizer::step(global::Tensor &weight, global::Tensor &grad) { + grad *= config.getLearningRate() / batchSize; + weight -= grad; } } // namespace nn::model diff --git a/src/model/optimizers.hpp b/src/model/optimizers.hpp index a910edf..203ea4e 100644 --- a/src/model/optimizers.hpp +++ b/src/model/optimizers.hpp @@ -13,7 +13,7 @@ class IOptimizer { public: virtual ~IOptimizer() = default; - virtual void step(global::Tensor &weight, const global::Tensor &grad) = 0; + virtual void step(global::Tensor &weight, global::Tensor &grad) = 0; virtual void reset() = 0; void setOfset(const int batchSize_) { batchSize = batchSize_; } @@ -27,7 +27,7 @@ class ConstantOptimizer : public IOptimizer { ConstantOptimizer(const ConstantOptimizerConfig &config_) : config(config_) {} - void step(global::Tensor &weight, const global::Tensor &grad) override; + void step(global::Tensor &weight, global::Tensor 
&grad) override; void reset() override {} }; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ef6372a..e71c0f9 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -78,10 +78,11 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { - tensor_gpu::deallocate(gpu_data); - gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::deallocate(gpu_data); + gpu_data = temp; } return *this; } @@ -134,7 +135,7 @@ Tensor &Tensor::operator+=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { - tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -147,7 +148,7 @@ Tensor &Tensor::operator-=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { - tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -160,7 +161,7 @@ Tensor &Tensor::operator*=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { - tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -173,7 +174,7 @@ Tensor &Tensor::operator/=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { - tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -183,7 +184,7 @@ Tensor &Tensor::operator*=(ValueType scalar) { for (auto &x : cpu_data) x *= scalar; } else { - tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -193,7 +194,7 @@ Tensor &Tensor::operator-=(ValueType scalar) { for (auto &x : cpu_data) x -= scalar; } else { - tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -203,7 +204,7 @@ Tensor &Tensor::operator+=(ValueType scalar) { for (auto &x : cpu_data) x += scalar; } else { - tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -213,35 +214,11 @@ Tensor &Tensor::operator/=(ValueType scalar) { for (auto &x : cpu_data) x /= scalar; } else { - tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } -Tensor Tensor::operator*(ValueType scalar) const { - Tensor result(*this); - result *= scalar; - return result; -} - -Tensor Tensor::operator/(ValueType scalar) const { - Tensor result(*this); - result /= scalar; - return result; -} - -Tensor Tensor::operator-(ValueType scalar) const { - Tensor result(*this); - result -= scalar; - return result; -} - 
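// [Editor's note — not part of the patch] Dropping these by-value scalar
// operators pushes callers to the in-place forms (*=, /=, +=, -=), so no
// temporary Tensor is built per expression; on the GPU path a temporary would
// also mean an extra device allocation and a device-to-device copy. The
// optimizers.cpp hunk above is the matching call-site change, roughly:
//
//     // before: weight -= grad * (config.getLearningRate() / batchSize);
//     // after:  grad *= config.getLearningRate() / batchSize;
//     //         weight -= grad;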
-Tensor Tensor::operator+(ValueType scalar) const { - Tensor result(*this); - result += scalar; - return result; -} - Tensor Tensor::matmul(const Tensor &other) const { const auto &aShape = shape; const auto &bShape = other.shape; From 2f64b7ecce63072d870609f6bb7b9b55dd0888ca Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 19:10:58 +0300 Subject: [PATCH 23/40] small change --- src/model/tensor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index e71c0f9..749eaeb 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -3,7 +3,6 @@ #include #include #include -#include namespace nn::global { Tensor::Tensor(const std::vector &shape_, float init) { @@ -23,6 +22,7 @@ Tensor::Tensor(const std::vector &shape_, float init) { } else { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_data_size = totalSize; + fill(init); } computeStrides(); From 7890c8f88ff51ece577bc7768e105ef0dc820dd6 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 14:49:11 +0300 Subject: [PATCH 24/40] bug fix, improve performance --- include/network/INetwork.hpp | 2 +- include/tensor.hpp | 4 ++- src/model/model.cpp | 4 ++- src/model/tensor.cpp | 54 +++++++++++++++------------------ src/networks/cnn/CNNetwork.cpp | 2 +- src/networks/cnn/CNNetwork.hpp | 2 +- src/networks/fnn/DenseLayer.cpp | 42 ++++++++++++------------- src/networks/fnn/DenseLayer.hpp | 13 +++++--- src/networks/fnn/FNNetwork.cpp | 14 ++++----- src/networks/fnn/FNNetwork.hpp | 4 +-- 10 files changed, 70 insertions(+), 71 deletions(-) diff --git a/include/network/INetwork.hpp b/include/network/INetwork.hpp index 530a444..574c600 100644 --- a/include/network/INetwork.hpp +++ b/include/network/INetwork.hpp @@ -11,7 +11,7 @@ class INetwork { virtual ~INetwork() = default; virtual void forward(const global::Tensor &input) = 0; - virtual void backward(const global::Tensor &outputDeltas) = 0; + virtual void backward(global::Tensor **outputDeltas) = 0; virtual void updateWeights(IOptimizer &optimizer) = 0; virtual void resetGradient() = 0; diff --git a/include/tensor.hpp b/include/tensor.hpp index 3da043c..f422913 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -42,6 +42,8 @@ class Tensor { // Shape and size size_t numElements() const; + const std::vector &getShape() const { return shape; } + const std::vector &getStrides() const { return strides; } void getData(std::vector &dest) const; void fill(const ValueType &value); @@ -57,7 +59,7 @@ class Tensor { Tensor matmul(const Tensor &other) const; static Tensor outer(const Tensor &a, const Tensor &b); - Tensor matmulT(const Tensor &vec) const; + void matmulT(const Tensor &vec, Tensor &result) const; }; } // namespace nn::global diff --git a/src/model/model.cpp b/src/model/model.cpp index 525e4df..70249a7 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,6 +1,7 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" #include "dataBase.hpp" +#include "tensor.hpp" #include #include #include @@ -159,9 +160,10 @@ void Model::updateWeights(const int batchSize) { void Model::Backward(const global::Tensor &output) { global::Tensor deltas = output; + global::Tensor *delta = &deltas; for (int i = static_cast(network.size()) - 1; i >= 0; --i) { - network[i]->backward(deltas); + network[i]->backward(&delta); deltas = network[i]->getInput(); } } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 749eaeb..0bfe7b0 100644 --- a/src/model/tensor.cpp 
+++ b/src/model/tensor.cpp @@ -22,7 +22,7 @@ Tensor::Tensor(const std::vector &shape_, float init) { } else { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_data_size = totalSize; - fill(init); + fill(init); } computeStrides(); @@ -73,17 +73,25 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - shape = other.shape; - strides = other.strides; if (!isGpu) { cpu_data = other.cpu_data; } else { - ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); - gpu_data_size = other.gpu_data_size; + ValueType *temp = gpu_data; + if (gpu_data_size != other.gpu_data_size) { + temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + + gpu_data_size = other.gpu_data_size; + } tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::deallocate(gpu_data); - gpu_data = temp; + + if (gpu_data_size != other.gpu_data_size) { + tensor_gpu::deallocate(gpu_data); + gpu_data = temp; + } } + + shape = other.shape; + strides = other.strides; return *this; } @@ -276,36 +284,22 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { return result; } -Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = shape; - const auto &vShape = vec.shape; - - if (wShape.size() != 2 || vShape.size() != 1) +void Tensor::matmulT(const Tensor &vec, Tensor &result) const { + if (shape.size() != 2 || vec.shape.size() != 1) throw std::runtime_error("matmulT: bad dimensions"); - - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) + if (vec.shape[0] != shape[0]) throw std::runtime_error("matmulT: incompatible"); - Tensor result({N}, 0.0f); - if (!isGpu) { - const float *W = cpu_data.data(); - const float *V = vec.cpu_data.data(); - float *R = result.cpu_data.data(); - - for (size_t i = 0; i < N; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < M; ++j) { - sum += W[j * N + i] * V[j]; + result.fill(0); + for (size_t i = 0; i < shape[1]; ++i) { + for (size_t j = 0; j < shape[0]; ++j) { + result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j]; } - R[i] = sum; } - return result; + } else { + tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]); } - tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); - return result; } Tensor::~Tensor() { diff --git a/src/networks/cnn/CNNetwork.cpp b/src/networks/cnn/CNNetwork.cpp index fc45ba3..a2d8b1c 100644 --- a/src/networks/cnn/CNNetwork.cpp +++ b/src/networks/cnn/CNNetwork.cpp @@ -16,7 +16,7 @@ void CNNetwork::forward(const global::Tensor &newInput) { input = newInput; } -void CNNetwork::backward(const global::Tensor &) { +void CNNetwork::backward(global::Tensor **) { } global::ValueType CNNetwork::getLoss(const global::Prediction &) const { diff --git a/src/networks/cnn/CNNetwork.hpp b/src/networks/cnn/CNNetwork.hpp index e2ad58d..1c08705 100644 --- a/src/networks/cnn/CNNetwork.hpp +++ b/src/networks/cnn/CNNetwork.hpp @@ -24,7 +24,7 @@ class CNNetwork : public INetwork { ~CNNetwork() override = default; void forward(const global::Tensor &newInput) override; - void backward(const global::Tensor &outputDeltas) override; + void backward(global::Tensor **outputDeltas) override; void updateWeights(IOptimizer &optimizer) override; void resetGradient() override; diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index 51a92a6..feacf6c 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ 
b/src/networks/fnn/DenseLayer.cpp @@ -11,7 +11,8 @@ DenseLayer::DenseLayer( out({size}), parameters(size, prevSize), gradients(size, prevSize), - activationFunction(activation) { + activationFunction(activation), + deltaL({size}) { if (randomInit) { fillParamRandom(); } @@ -40,25 +41,25 @@ void Output_Layer::forward(const global::Tensor &metrix) { activationFunction.activate(net, out); } -global::Tensor Output_Layer::getDelta(const global::Tensor &output) { - global::Tensor deltas = out; - deltas -= output; - - return deltas; +void Output_Layer::getDelta(const global::Tensor &output) { + deltaL = out; + deltaL -= output; } void Output_Layer::backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *) { if (activationFunction.getType() == ActivationType::Softmax) { - deltas = getDelta(deltas); + getDelta(**deltas); } else { - activationFunction.derivativeActivate(out, deltas); + activationFunction.derivativeActivate(out, **deltas); + deltaL = **deltas; } - gradients.biases += deltas; - gradients.weights += global::Tensor::outer(deltas, prevLayer); + gradients.biases += deltaL; + gradients.weights += global::Tensor::outer(deltaL, prevLayer); + *deltas = &deltaL; } global::ValueType Output_Layer::getCrossEntropyLoss( @@ -87,32 +88,31 @@ void Hidden_Layer::forward(const global::Tensor &metrix) { } void Hidden_Layer::backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *nextLayer) { if (!nextLayer) return; - deltas = getDelta(deltas, *nextLayer); + calculateDelta(**deltas, *nextLayer); if (isTraining && config.dropoutRate) { - deltas *= dropoutMask; + deltaL *= dropoutMask; } - gradients.biases += deltas; + gradients.biases += deltaL; - gradients.weights += global::Tensor::outer(deltas, prevLayer); + gradients.weights += global::Tensor::outer(deltaL, prevLayer); + *deltas = &deltaL; } -global::Tensor Hidden_Layer::getDelta( +void Hidden_Layer::calculateDelta( const global::Tensor &output, const LayerParams &nextLayer) { - auto deltas = nextLayer.weights.matmulT(output); - activationFunction.derivativeActivate(out, deltas); - - return deltas; + nextLayer.weights.matmulT(output, deltaL); + activationFunction.derivativeActivate(out, deltaL); } size_t DenseLayer::getParamCount() const { diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp index 2b401a2..04c807f 100644 --- a/src/networks/fnn/DenseLayer.hpp +++ b/src/networks/fnn/DenseLayer.hpp @@ -3,6 +3,7 @@ #include "../../model/config.hpp" #include "../src/model/optimizers.hpp" +#include "tensor.hpp" namespace nn::model::fnn { constexpr global::ValueType MIN_LOSS_VALUE = 1e-10; @@ -14,6 +15,7 @@ struct LayerParams { size_t size_; size_t prevSize_; + LayerParams(size_t out_dim, size_t in_dim) : weights({out_dim, in_dim}), biases({out_dim}), size_(out_dim), prevSize_(in_dim) {} @@ -35,6 +37,7 @@ class DenseLayer { Activation activationFunction; bool isTraining{false}; + global::Tensor deltaL; void fillParamRandom(); @@ -49,7 +52,7 @@ class DenseLayer { virtual void forward(const global::Tensor &metrix) = 0; void updateWeight(IOptimizer &optimizer); virtual void backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *nextLayer = nullptr) = 0; virtual global::ValueType getLoss(const global::Prediction &) { return 0; }; @@ -77,7 +80,7 @@ class DenseLayer { class Hidden_Layer : public DenseLayer { private: const DenseLayerConfig &config; - 
global::Tensor getDelta( + void calculateDelta( const global::Tensor &output, const LayerParams &nextLayer); @@ -97,7 +100,7 @@ class Hidden_Layer : public DenseLayer { void forward(const global::Tensor &metrix) override; void backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *nextLayer) override; }; @@ -106,7 +109,7 @@ class Output_Layer : public DenseLayer { private: const FNNConfig &config; - global::Tensor getDelta(const global::Tensor &output); + void getDelta(const global::Tensor &output); static global::ValueType getCrossEntropyLoss( const global::Tensor &prediction, const size_t target); @@ -126,7 +129,7 @@ class Output_Layer : public DenseLayer { void forward(const global::Tensor &metrix) override; void backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *) override; diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 4304b9a..d723cc5 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -72,12 +72,10 @@ void FNNetwork::vUpdate() { visual->attempPause(); } -void FNNetwork::backward(const global::Tensor &outputDeltas) { - global::Tensor deltas = outputDeltas; - +void FNNetwork::backward(global::Tensor **outputDeltas) { resetGradient(); - layers.back()->backward(deltas, layers[layers.size() - 2]->getOut()); + layers.back()->backward(outputDeltas, layers[layers.size() - 2]->getOut()); if (visual) { visual->setGrad(layers.size() - 1, layers[layers.size() - 1]->getGrad()); @@ -85,7 +83,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) { for (int i = static_cast(layers.size()) - 2; i >= 0; --i) { const global::Tensor &prev = (i == 0) ? input : layers[i - 1]->getOut(); - layers[i]->backward(deltas, prev, &layers[i + 1]->getParms()); + layers[i]->backward(outputDeltas, prev, &layers[i + 1]->getParms()); if (visual) { visual->setGrad(i, layers[i]->getGrad()); @@ -94,7 +92,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) { vUpdate(); } - calculateInputDelta(deltas); + calculateInputDelta(outputDeltas); } global::ValueType FNNetwork::getLoss(const global::Prediction &pre) const { @@ -137,8 +135,8 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) { } } -void FNNetwork::calculateInputDelta(const global::Tensor &deltas) { - input = layers[0]->getParms().weights.matmulT(deltas); +void FNNetwork::calculateInputDelta(global::Tensor **deltas) { + layers[0]->getParms().weights.matmulT(**deltas, input); } size_t FNNetwork::getParamCount() const { diff --git a/src/networks/fnn/FNNetwork.hpp b/src/networks/fnn/FNNetwork.hpp index 93d2366..674735c 100644 --- a/src/networks/fnn/FNNetwork.hpp +++ b/src/networks/fnn/FNNetwork.hpp @@ -13,7 +13,7 @@ class FNNetwork : public INetwork { const std::shared_ptr visual; - void calculateInputDelta(const global::Tensor &deltas); + void calculateInputDelta(global::Tensor **deltas); void vUpdate(); @@ -28,7 +28,7 @@ class FNNetwork : public INetwork { ~FNNetwork() override = default; void forward(const global::Tensor &newInput) override; - void backward(const global::Tensor &outputDeltas) override; + void backward(global::Tensor **outputDeltas) override; void updateWeights(IOptimizer &optimizer) override; void resetGradient() override; From d3133930773c18cb315455941ca6aef5d00258e8 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 15:35:10 +0300 Subject: [PATCH 25/40] improved performance --- include/tensor.hpp | 7 +++--- 
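The speed-up in this patch comes from making matmul, outer, and matmulT write into a caller-provided result instead of constructing and returning a fresh Tensor on every call, so each layer can allocate its buffers once and reuse them across samples. A condensed sketch of the resulting call pattern, mirroring the updated DenseLayer::forward further below (`x` and `outDim` are illustrative names only):

    // Buffers such as `net` live in the layer and are reused on every forward pass.
    nn::global::Tensor net({outDim});        // allocated once, at layer construction
    parameters.weights.matmul(x, net);       // net = W * x; matmul zeroes `net` internally
    net += parameters.biases;                // in-place bias add
    activationFunction.activate(net, out);   // writes into the preallocated `out`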
src/model/tensor.cpp | 40 ++++++++++++++++-------------- src/model/tensor_gpu.cu | 2 +- src/networks/fnn/DenseLayer.cpp | 9 +++---- tests/data/config-binary_test.json | 10 +------- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index f422913..06995b1 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -19,7 +19,7 @@ class Tensor { ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; - static const bool isGpu{true}; + static const bool isGpu{false}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -46,6 +46,7 @@ class Tensor { const std::vector &getStrides() const { return strides; } void getData(std::vector &dest) const; void fill(const ValueType &value); + void zero(); Tensor &operator+=(const Tensor &other); Tensor &operator-=(const Tensor &other); @@ -57,8 +58,8 @@ class Tensor { Tensor &operator+=(ValueType scalar); Tensor &operator-=(ValueType scalar); - Tensor matmul(const Tensor &other) const; - static Tensor outer(const Tensor &a, const Tensor &b); + void matmul(const Tensor &other, Tensor &result) const; + static void outer(const Tensor &a, const Tensor &b, Tensor &result); void matmulT(const Tensor &vec, Tensor &result) const; }; } // namespace nn::global diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 0bfe7b0..f4b7795 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -50,12 +50,9 @@ size_t Tensor::numElements() const { void Tensor::getData(std::vector &dest) const { if (!isGpu) { dest = cpu_data; + } else { + tensor_gpu::copyToHost(dest.data(), gpu_data, gpu_data_size * sizeof(ValueType)); } - - ValueType *newV = nullptr; - tensor_gpu::copyToHost(newV, gpu_data, gpu_data_size * sizeof(ValueType)); - - std::copy(newV, newV + gpu_data_size, dest.begin()); } void Tensor::fill(const ValueType &value) { @@ -69,6 +66,14 @@ void Tensor::fill(const ValueType &value) { } } +void Tensor::zero() { + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + } else { + fill(0); + } +} + Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; @@ -106,7 +111,6 @@ void Tensor::computeStrides() { } inline size_t Tensor::flattenIndex(const std::vector &indices) const { - // CPU version, same as before if (indices.size() != shape.size()) { throw std::invalid_argument("Incorrect number of indices."); } @@ -227,7 +231,7 @@ Tensor &Tensor::operator/=(ValueType scalar) { return *this; } -Tensor Tensor::matmul(const Tensor &other) const { +void Tensor::matmul(const Tensor &other, Tensor &result) const { const auto &aShape = shape; const auto &bShape = other.shape; if (aShape.size() != 2 || bShape.size() != 1) @@ -237,7 +241,8 @@ Tensor Tensor::matmul(const Tensor &other) const { size_t K = aShape[1]; if (K != bShape[0]) throw std::runtime_error("matmul: shape mismatch."); - Tensor result({M}); + + result.zero(); if (!isGpu) { const float *A = cpu_data.data(); @@ -252,13 +257,12 @@ Tensor Tensor::matmul(const Tensor &other) const { } R[i] = sum; } - return result; + } else { + tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); } - tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); - return result; } -Tensor Tensor::outer(const Tensor &a, const Tensor &b) { +void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) { if (a.shape.size() != 1 || b.shape.size() != 1) { throw std::runtime_error("outer: both tensors must be 1D vectors"); } @@ -266,7 +270,7 @@ Tensor Tensor::outer(const Tensor &a, 
const Tensor &b) { size_t m = a.shape[0]; size_t n = b.shape[0]; - Tensor result({m, n}); + result.zero(); if (!isGpu) { float *r = result.cpu_data.data(); @@ -275,13 +279,12 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { for (size_t i = 0; i < m; ++i) { for (size_t j = 0; j < n; ++j) { - r[i * n + j] = A[i] * B[j]; + r[i * n + j] += A[i] * B[j]; } } - return result; + } else { + tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); } - tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); - return result; } void Tensor::matmulT(const Tensor &vec, Tensor &result) const { @@ -290,8 +293,9 @@ void Tensor::matmulT(const Tensor &vec, Tensor &result) const { if (vec.shape[0] != shape[0]) throw std::runtime_error("matmulT: incompatible"); + result.zero(); + if (!isGpu) { - result.fill(0); for (size_t i = 0; i < shape[1]; ++i) { for (size_t j = 0; j < shape[0]; ++j) { result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j]; diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 6b7cbb6..587c8de 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -512,7 +512,7 @@ __global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *r if (idx < total) { size_t i = idx / n; size_t j = idx % n; - result[i * n + j] = a[i] * b[j]; + result[i * n + j] += a[i] * b[j]; } } diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index feacf6c..cc7b208 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -34,8 +34,7 @@ void Hidden_Layer::CreateDropoutMask() { } void Output_Layer::forward(const global::Tensor &metrix) { - - net = parameters.weights.matmul(metrix); + parameters.weights.matmul(metrix, net); net += parameters.biases; activationFunction.activate(net, out); @@ -58,7 +57,7 @@ void Output_Layer::backward( } gradients.biases += deltaL; - gradients.weights += global::Tensor::outer(deltaL, prevLayer); + global::Tensor::outer(deltaL, prevLayer, gradients.weights); *deltas = &deltaL; } @@ -76,7 +75,7 @@ void Hidden_Layer::forward(const global::Tensor &metrix) { if (isTraining) CreateDropoutMask(); - net = parameters.weights.matmul(metrix); + parameters.weights.matmul(metrix, net); net += parameters.biases; if (isTraining && config.dropoutRate > 0.0f) { @@ -103,7 +102,7 @@ void Hidden_Layer::backward( gradients.biases += deltaL; - gradients.weights += global::Tensor::outer(deltaL, prevLayer); + global::Tensor::outer(deltaL, prevLayer, gradients.weights); *deltas = &deltaL; } diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index e6982c6..c8280ac 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": false, + "enableVisuals": true, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, @@ -10,14 +10,6 @@ "training config": { "batch size": 32, "batch count": 1000, - "auto save": { - "saveEvery": 2000, - "dataFilenameAutoSave": "mode.txt" - }, - "auto evaluating": { - "evaluateEvery": 10, - "dataBaseFilename": "../tests/data/database-binary_test" - }, "optimizer": { "type": "const", "lr": 0.1 From 0ef22708d5bf34f0491b75a543dcf48846cd8937 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 15:43:17 +0300 Subject: [PATCH 26/40] small change --- include/tensor.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 06995b1..41eda35 100644 --- 
a/include/tensor.hpp +++ b/include/tensor.hpp @@ -24,8 +24,6 @@ class Tensor { void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; - void loadTempGpu() const; - friend model::Activation; public: From 08572514e5e251e2dbff75cc337fb27c2e6601a8 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 19:00:12 +0300 Subject: [PATCH 27/40] improved performance --- include/tensor.hpp | 7 +++-- src/model/activations.cpp | 1 + src/model/model.cpp | 12 +++++---- src/model/tensor.cpp | 34 +++++++++++++++++-------- src/model/tensor_gpu.cu | 25 +++++++++--------- src/model/tensor_gpu.hpp | 16 ++++++------ src/networks/fnn/DenseLayer.cpp | 9 ++++--- src/visualizer/VisualizerController.cpp | 2 +- tests/data/config-binary_test.json | 6 ++--- 9 files changed, 67 insertions(+), 45 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 41eda35..ba5f7cd 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -10,6 +10,8 @@ class Activation; namespace nn::global { +constexpr bool GPU_MODE = true; + class Tensor { private: std::vector cpu_data; @@ -17,9 +19,9 @@ class Tensor { std::vector strides; ValueType *gpu_data = nullptr; - std::size_t gpu_data_size{0}; + std::size_t gpu_data_size; - static const bool isGpu{false}; + static const bool isGpu{GPU_MODE}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -34,6 +36,7 @@ class Tensor { ~Tensor(); Tensor &operator=(const Tensor &other); + Tensor &operator=(const std::vector &other); ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 9f537c3..f7a70b1 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,6 +1,7 @@ #include "activations.hpp" #include "tensor.hpp" #include "tensor_gpu.hpp" +#include namespace nn::model { void Activation::activate(const global::Tensor &net, global::Tensor &out) const { diff --git a/src/model/model.cpp b/src/model/model.cpp index 70249a7..a5a958f 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -160,7 +160,7 @@ void Model::updateWeights(const int batchSize) { void Model::Backward(const global::Tensor &output) { global::Tensor deltas = output; - global::Tensor *delta = &deltas; + global::Tensor *delta = &deltas; for (int i = static_cast(network.size()) - 1; i >= 0; --i) { network[i]->backward(&delta); @@ -179,16 +179,18 @@ global::ValueType Model::runBackPropagation( } resetNetworkGradient(); + global::Tensor output({outputSize()}); for (size_t i = 0; i < batch.size(); ++i) { TrainSample *current_sample_ptr = batch.samples.at(i); - visual.updatePrediction(current_sample_ptr->pre); + output.zero(); + // visual.updatePrediction(current_sample_ptr->pre); - runModel(transformation(current_sample_ptr->input)); + // runModel(transformation(current_sample_ptr->input)); - global::Tensor output({outputSize()}); - output.setValue({current_sample_ptr->pre.index}, 1); + runModel(current_sample_ptr->input); if (doBackward) { + output.setValue({current_sample_ptr->pre.index}, 1); Backward(output); updateWeights(batch.size()); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index f4b7795..2c297b2 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,8 +1,8 @@ #include "tensor_gpu.hpp" -#include #include #include #include +#include namespace nn::global { Tensor::Tensor(const std::vector &shape_, float init) { @@ -58,7 +58,7 @@ void 
Tensor::getData(std::vector &dest) const { void Tensor::fill(const ValueType &value) { if (isGpu) { tensor_gpu::zero(gpu_data, gpu_data_size); - tensor_gpu::add(gpu_data, value, gpu_data, gpu_data_size); + tensor_gpu::add_scalar(gpu_data, value, gpu_data, gpu_data_size); } else { for (auto &n : cpu_data) { n = value; @@ -100,6 +100,20 @@ Tensor &Tensor::operator=(const Tensor &other) { return *this; } +Tensor &Tensor::operator=(const std::vector &other) { + if (other.size() != numElements()) { + throw std::invalid_argument(""); + } + + if (!isGpu) { + cpu_data = other; + } else { + tensor_gpu::copyToDevice(gpu_data, other.data(), gpu_data_size * sizeof(ValueType)); + } + + return *this; +} + void Tensor::computeStrides() { const size_t dim = shape.size(); strides.resize(dim); @@ -147,7 +161,7 @@ Tensor &Tensor::operator+=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { - tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::add_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -160,7 +174,7 @@ Tensor &Tensor::operator-=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { - tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::subtraction_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -173,7 +187,7 @@ Tensor &Tensor::operator*=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { - tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::multiply_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -186,7 +200,7 @@ Tensor &Tensor::operator/=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { - tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::division_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -196,7 +210,7 @@ Tensor &Tensor::operator*=(ValueType scalar) { for (auto &x : cpu_data) x *= scalar; } else { - tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -206,7 +220,7 @@ Tensor &Tensor::operator-=(ValueType scalar) { for (auto &x : cpu_data) x -= scalar; } else { - tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -216,7 +230,7 @@ Tensor &Tensor::operator+=(ValueType scalar) { for (auto &x : cpu_data) x += scalar; } else { - tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -226,7 +240,7 @@ Tensor &Tensor::operator/=(ValueType scalar) { for (auto &x : cpu_data) x /= scalar; } else { - tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 587c8de..7a33f34 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -60,7 +60,7 @@ __global__ void addKernel(const ValueType* A, const ValueType* B, ValueType* C, } // Element-wise addition: C = A + B -void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void add_vec(const ValueType* A, const 
ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); @@ -76,7 +76,7 @@ __global__ void subtractionKernel(const ValueType* A, const ValueType* B, ValueT } // Element-wise addition: C = A + B -void subtraction(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); @@ -91,7 +91,7 @@ __global__ void divisionKernel(const ValueType* A, const ValueType* B, ValueType } // Element-wise addition: C = A / B -void division(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); @@ -106,7 +106,7 @@ __global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType } // Element-wise multiply: C = A * B -void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); @@ -121,7 +121,7 @@ __global__ void addKernel(const ValueType* A, const ValueType B, ValueType* C, s } // Element-wise addition: C = A + B -void add(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void add_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); @@ -137,7 +137,7 @@ __global__ void subtractionKernel(const ValueType* A, const ValueType B, ValueTy } // Element-wise addition: C = A + B -void subtraction(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void subtraction_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); @@ -152,7 +152,7 @@ __global__ void divisionKernel(const ValueType* A, const ValueType B, ValueType* } // Element-wise addition: C = A / B -void division(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void division_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); @@ -167,7 +167,7 @@ __global__ void multiplyKernel(const ValueType* A, const ValueType B, ValueType* } // Element-wise multiply: C = A * B -void multiply(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void multiply_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); @@ -321,7 +321,9 @@ __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::si if (idx >= count) return; // Load input into shared memory - shared[tid] = input[idx]; + if (idx < count) shared[tid] = input[idx]; + 
else shared[tid] = -INFINITY; // or 0 + __syncthreads(); // Step 1: Find max value for numerical stability @@ -512,16 +514,15 @@ __global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *r if (idx < total) { size_t i = idx / n; size_t j = idx % n; - result[i * n + j] += a[i] * b[j]; + result[i * n + j] = a[i] * b[j]; // Use '=' since result is zeroed before } } __global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { size_t col = blockIdx.x * blockDim.x + threadIdx.x; if (col < N) { - ValueType sum = 0; + ValueType sum = 0.0f; for (size_t i = 0; i < M; ++i) { - // W is M x N, access element at (i, col) sum += W[i * N + col] * V[i]; } R[col] = sum; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 4e48651..153ce14 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -29,28 +29,28 @@ void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t coun void zero(ValueType *deviceData, std::size_t count); /// Element-wise addition: C = A + B -void add(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void add_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise addition: C = A - B -void subtraction(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void subtraction_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise addition: C = A / B -void division(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void division_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise multiply: C = A * B -void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void multiply_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise addition: C = A + B -void add(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void add_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); /// Element-wise addition: C = A - B -void subtraction(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void subtraction_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); /// Element-wise addition: C = A / B -void division(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void division_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); /// Element-wise multiply: C = A * B -void multiply(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void multiply_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index cc7b208..8584637 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -1,5 +1,6 @@ #include "DenseLayer.hpp" #include +#include namespace nn::model::fnn { DenseLayer::DenseLayer( @@ -167,11 +168,11 @@ void DenseLayer::fillParamRandom() { global::ValueType std_dev = std::sqrt(2.0 / static_cast(prevSize())); std::normal_distribution<> dist(0.0, std_dev); - for (size_t i = 0; i < parameters.size(); ++i) { - for (size_t j = 0; j < parameters.prevSize(); ++j) { - parameters.weights.setValue({i, j}, dist(gen)); - } + std::vector 
temp(parameters.weights.numElements()); + for (size_t i = 0; i < temp.size(); ++i) { + temp[i] = dist(gen); } + parameters.weights = temp; } void DenseLayer::resetDots() { diff --git a/src/visualizer/VisualizerController.cpp b/src/visualizer/VisualizerController.cpp index f1e9a60..fc752d3 100644 --- a/src/visualizer/VisualizerController.cpp +++ b/src/visualizer/VisualizerController.cpp @@ -3,7 +3,6 @@ namespace nn::visualizer { VisualManager::VisualManager(const model::Config &_config) : config(_config) { - printf("start Visualizer\n"); } void VisualManager::initState() { @@ -44,6 +43,7 @@ void VisualManager::start() { } void VisualManager::startVisuals() { + printf("start Visualizer\n"); Vstate = std::make_shared(config); if (!Vstate) { return; diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index c8280ac..c078a9e 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": true, + "enableVisuals": false, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, @@ -22,8 +22,8 @@ "output size": 16, "output activation": 4, "layers": [ - { "size": 100, "activationType": 1 }, - { "size": 30, "activationType": 1 } + { "size": 1000, "activationType": 1 }, + { "size": 3000, "activationType": 1 } ] } ] From c5530d153f93f110d68b9a53c4c80452894dd85e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maayan=20Portugues=20=F0=9F=8E=97=EF=B8=8F?= Date: Fri, 8 Aug 2025 19:04:29 +0300 Subject: [PATCH 28/40] Update cmake-multi-platform.yml for cuda --- .github/workflows/cmake-multi-platform.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 4e76790..c904d29 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -11,6 +11,9 @@ jobs: runs-on: ubuntu-latest steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install dependencies run: | sudo apt-get update @@ -26,13 +29,23 @@ jobs: libgl1-mesa-dev \ libegl1-mesa-dev \ libxi-dev \ - libfreetype6-dev + libfreetype6-dev \ + wget - - name: Checkout code - uses: actions/checkout@v4 + - name: Install NVIDIA CUDA Toolkit + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin + sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" + sudo apt-get update + sudo apt-get -y install cuda-toolkit-12-4 # Or latest version you support + echo "/usr/local/cuda/bin" >> $GITHUB_PATH + echo "/usr/local/cuda/lib64" | sudo tee /etc/ld.so.conf.d/cuda.conf + sudo ldconfig - name: Configure - run: cmake -B build -DBUILD_SHARED_LIBS=TRUE + run: cmake -B build -DBUILD_SHARED_LIBS=TRUE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc - name: Build run: cmake --build build --config Release From 7f8d49c9bbb83de1c77050900e89e2e29a6eb938 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 19:12:16 +0300 Subject: [PATCH 29/40] small change - restoring --- .github/workflows/cmake-multi-platform.yml | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/.github/workflows/cmake-multi-platform.yml 
b/.github/workflows/cmake-multi-platform.yml index c904d29..560d323 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -11,9 +11,6 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Install dependencies run: | sudo apt-get update @@ -29,23 +26,14 @@ jobs: libgl1-mesa-dev \ libegl1-mesa-dev \ libxi-dev \ - libfreetype6-dev \ - wget + libfreetype6-dev - - name: Install NVIDIA CUDA Toolkit - run: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin - sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub - sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" - sudo apt-get update - sudo apt-get -y install cuda-toolkit-12-4 # Or latest version you support - echo "/usr/local/cuda/bin" >> $GITHUB_PATH - echo "/usr/local/cuda/lib64" | sudo tee /etc/ld.so.conf.d/cuda.conf - sudo ldconfig + - name: Checkout code + uses: actions/checkout@v4 - name: Configure - run: cmake -B build -DBUILD_SHARED_LIBS=TRUE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc + run: cmake -B build -DBUILD_SHARED_LIBS=TRUE - name: Build run: cmake --build build --config Release + From fd16a3b0a77a5a2b644b69ed6a4e5bfddd8dec0d Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 21:38:23 +0300 Subject: [PATCH 30/40] improved performance --- include/network/IvisualNetwork.hpp | 3 +- include/tensor.hpp | 3 ++ src/model/activations.hpp | 14 +++----- src/model/dataBase.hpp | 1 + src/model/tensor.cpp | 12 +++++++ src/networks/fnn/DenseLayer.cpp | 56 +++++++++++++----------------- src/networks/fnn/DenseLayer.hpp | 9 +++-- src/networks/fnn/FNNetwork.cpp | 16 +++------ src/networks/fnn/FnnVisualizer.cpp | 1 - src/visualizer/visualModel.cpp | 4 ++- tests/binary_test.cpp | 17 +++++---- tests/data/config-binary_test.json | 6 ++-- 12 files changed, 70 insertions(+), 72 deletions(-) diff --git a/include/network/IvisualNetwork.hpp b/include/network/IvisualNetwork.hpp index ae08c3a..58b49af 100644 --- a/include/network/IvisualNetwork.hpp +++ b/include/network/IvisualNetwork.hpp @@ -3,8 +3,7 @@ #include "../../src/visualizer/panel.hpp" #include -#include -#include +#include namespace nn::visualizer { constexpr std::uint32_t MODEL_HEIGHT = 770u; diff --git a/include/tensor.hpp b/include/tensor.hpp index ba5f7cd..1064f1d 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -40,6 +40,9 @@ class Tensor { ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); + void insertRange(const Tensor &other, + const size_t startO, const size_t startT, + const size_t length); // Shape and size size_t numElements() const; diff --git a/src/model/activations.hpp b/src/model/activations.hpp index 9c6b4a2..010de2a 100644 --- a/src/model/activations.hpp +++ b/src/model/activations.hpp @@ -2,10 +2,10 @@ #define ACTIVATIONSP #include "tensor.hpp" -#include #include namespace nn::model { + constexpr global::ValueType RELU_LEAKY_ALPHA = 0.01; constexpr global::ValueType maxValue(const global::ValueType &a, const float &b) { @@ -38,16 +38,13 @@ class Activation { static global::ValueType derivativeTanh(const global::ValueType z); static void relu(const global::Tensor &net, global::Tensor &out); - static void derivativeRelu(const global::Tensor 
&net, - global::Tensor &out); + static void derivativeRelu(const global::Tensor &net, global::Tensor &out); - static void leakyRelu(const global::Tensor &net, - global::Tensor &out); + static void leakyRelu(const global::Tensor &net, global::Tensor &out); static void derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out); - static void sigmoid(const global::Tensor &net, - global::Tensor &out); + static void sigmoid(const global::Tensor &net, global::Tensor &out); static void derivativeSigmoid(const global::Tensor &net, global::Tensor &out); @@ -55,8 +52,7 @@ class Activation { static void derivativeTanh(const global::Tensor &net, global::Tensor &out); - static void softmax(const global::Tensor &net, - global::Tensor &out); + static void softmax(const global::Tensor &net, global::Tensor &out); static global::ValueType maxVector(const global::Tensor &metrix); diff --git a/src/model/dataBase.hpp b/src/model/dataBase.hpp index 0ad890a..cf17ba8 100644 --- a/src/model/dataBase.hpp +++ b/src/model/dataBase.hpp @@ -4,6 +4,7 @@ #include "config.hpp" #include #include +#include namespace nn::model { const std::string DATABASE_FILE_EXETENTION = ".nndb"; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 2c297b2..fe1cd65 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -145,6 +145,18 @@ ValueType Tensor::getValue(const std::vector &indices) const { return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } +void Tensor::insertRange(const Tensor &other, + const size_t startO, const size_t startT, + const size_t length) { + if (isGpu) { + tensor_gpu::copyDeviceToDevice(gpu_data + startT, other.gpu_data + startO, length * sizeof(ValueType)); + } else { + for (size_t i = 0; i < length; ++i) { + cpu_data[i + startT] = other.cpu_data[i + startO]; + } + } +} + void Tensor::setValue(const std::vector &indices, const ValueType value) { if (!isGpu) { cpu_data[flattenIndex(indices)] = value; diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index 8584637..d955dc3 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -1,4 +1,5 @@ #include "DenseLayer.hpp" +#include #include #include @@ -12,8 +13,8 @@ DenseLayer::DenseLayer( out({size}), parameters(size, prevSize), gradients(size, prevSize), - activationFunction(activation), - deltaL({size}) { + deltaL({size}), + activationFunction(activation) { if (randomInit) { fillParamRandom(); } @@ -29,9 +30,12 @@ void Hidden_Layer::CreateDropoutMask() { static thread_local std::mt19937 rng{std::random_device{}()}; std::bernoulli_distribution bernoulli(keepProb); + static std::vector temp(dropoutMask.numElements(), 0); for (size_t i = 0; i < dropoutMask.numElements(); ++i) { - dropoutMask.setValue({i}, static_cast(bernoulli(rng))); + temp[i] = static_cast(bernoulli(rng)); } + + dropoutMask = temp; } void Output_Layer::forward(const global::Tensor &metrix) { @@ -125,41 +129,29 @@ void DenseLayer::updateWeight(nn::model::IOptimizer &optimizer) { } const global::Tensor DenseLayer::getData() const { - global::Tensor matrix({parameters.paramSize()}); - - size_t currentI = 0; - for (size_t i = 0; i < size(); ++i) { - for (size_t j = 0; j < prevSize(); ++j) { - matrix.setValue({currentI}, parameters.weights.getValue({i, j})); + size_t weightsSize = parameters.weights.numElements(); + size_t biasesSize = parameters.biases.numElements(); - ++currentI; - } - } + global::Tensor matrix({weightsSize + biasesSize}); - for (size_t i = 0; i < size(); ++i) { - 
matrix.setValue({currentI}, parameters.biases.getValue({i})); + // Copy weights + matrix.insertRange(parameters.weights, 0, 0, weightsSize); - ++currentI; - } + // Copy biases + matrix.insertRange(parameters.biases, 0, weightsSize, biasesSize); return matrix; } -void DenseLayer::setData(const global::Tensor newParam) { - size_t currentI = 0; - for (size_t i = 0; i < size(); ++i) { - for (size_t j = 0; j < prevSize(); ++j) { - parameters.weights.setValue({i, j}, newParam.getValue({currentI})); +void DenseLayer::setData(const global::Tensor newParam, const size_t offset) { + size_t weightsSize = parameters.weights.numElements(); + size_t biasesSize = parameters.biases.numElements(); - ++currentI; - } - } - - for (size_t i = 0; i < size(); ++i) { - parameters.biases.setValue({i}, newParam.getValue({currentI})); + // Copy into weights + parameters.weights.insertRange(newParam, offset, 0, weightsSize); - ++currentI; - } + // Copy into biases + parameters.biases.insertRange(newParam, offset + weightsSize, 0, biasesSize); } void DenseLayer::fillParamRandom() { @@ -168,11 +160,11 @@ void DenseLayer::fillParamRandom() { global::ValueType std_dev = std::sqrt(2.0 / static_cast(prevSize())); std::normal_distribution<> dist(0.0, std_dev); - std::vector temp(parameters.weights.numElements()); + std::vector temp(parameters.weights.numElements()); for (size_t i = 0; i < temp.size(); ++i) { - temp[i] = dist(gen); + temp[i] = dist(gen); } - parameters.weights = temp; + parameters.weights = temp; } void DenseLayer::resetDots() { diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp index 04c807f..651adb2 100644 --- a/src/networks/fnn/DenseLayer.hpp +++ b/src/networks/fnn/DenseLayer.hpp @@ -1,9 +1,8 @@ #ifndef DENSELAYER #define DENSELAYER -#include "../../model/config.hpp" #include "../src/model/optimizers.hpp" -#include "tensor.hpp" +#include namespace nn::model::fnn { constexpr global::ValueType MIN_LOSS_VALUE = 1e-10; @@ -15,7 +14,6 @@ struct LayerParams { size_t size_; size_t prevSize_; - LayerParams(size_t out_dim, size_t in_dim) : weights({out_dim, in_dim}), biases({out_dim}), size_(out_dim), prevSize_(in_dim) {} @@ -34,10 +32,11 @@ class DenseLayer { LayerParams parameters; LayerParams gradients; + global::Tensor deltaL; + Activation activationFunction; bool isTraining{false}; - global::Tensor deltaL; void fillParamRandom(); @@ -72,7 +71,7 @@ class DenseLayer { size_t getParamCount() const; const global::Tensor getData() const; - void setData(const global::Tensor newParam); + void setData(const global::Tensor newParam, const size_t offset); void setTraining(const bool state) { isTraining = state; } }; diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index d723cc5..89621c1 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -157,10 +157,8 @@ global::Tensor FNNetwork::getParams() const { for (size_t i = 0; i < layers.size(); ++i) { global::Tensor params = layers[i]->getData(); - for (size_t j = 0; j < params.numElements(); ++j) { - matrix.setValue({matrixI}, params.getValue({j})); - ++matrixI; - } + matrix.insertRange(params, 0, matrixI, params.numElements()); + matrixI += params.numElements(); } return matrix; @@ -169,14 +167,8 @@ global::Tensor FNNetwork::getParams() const { void FNNetwork::setParams(const global::Tensor params) { size_t j = 0; for (size_t i = 0; i < layers.size(); ++i) { - global::Tensor newParam({layers[i]->getParamCount()}); - - for (size_t k = 0; k < newParam.numElements(); ++k) { - 
newParam.setValue({k}, params.getValue({j})); - ++j; - } - - layers[i]->setData(newParam); + layers[i]->setData(params, j); + j += layers[i]->getParamCount(); if (visual) { visual->setParam(i, layers[i]->getParms()); diff --git a/src/networks/fnn/FnnVisualizer.cpp b/src/networks/fnn/FnnVisualizer.cpp index 47359a0..1726a78 100644 --- a/src/networks/fnn/FnnVisualizer.cpp +++ b/src/networks/fnn/FnnVisualizer.cpp @@ -254,7 +254,6 @@ void VisualDenseLayer::setGrad(const model::fnn::LayerParams &newGrad) { } void FnnVisualier::setWidth(const std::uint32_t newWidth) { - visualWidth = newWidth; if (networkRender.resize({newWidth, networkRender.getSize().y})) { } diff --git a/src/visualizer/visualModel.cpp b/src/visualizer/visualModel.cpp index 4be7484..f22f039 100644 --- a/src/visualizer/visualModel.cpp +++ b/src/visualizer/visualModel.cpp @@ -157,7 +157,9 @@ void ModelPanel::renderSubNetwork(const size_t index) { } void ModelPanel::setPrediction(const global::Prediction &pre) { - global::Tensor output({predictionLayer.size()}); + static global::Tensor output({predictionLayer.size()}); + output.zero(); + output.setValue({pre.index}, 1); predictionLayer.setValues(output); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 7bdd693..17b318a 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -66,7 +66,7 @@ void print_database(int actual_size, int input_size, int database_size) { } void printVector(const nn::global::Tensor &vec) { - for (size_t i =0; i < vec.numElements(); ++i) { + for (size_t i = 0; i < vec.numElements(); ++i) { std::cout << vec.getValue({i}) << ' '; } @@ -77,20 +77,23 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = tests::appendToBase("config-binary_test.json"); - // nn::global::Tensor give_me_a_name({5, 3}); - // printf("test: \n"); - // give_me_a_name.setValue({2, 1}, 5); - // return 0; + // nn::global::Tensor give_me_a_name({5, 3}); + // printf("test: \n"); + // give_me_a_name.setValue({2, 1}, 5); + // return 0; nn::model::Model model(config_FN); if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - std::vector files {"../tests/data/test1", "../tests/data/test2"}; + model.load("test.txt"); + nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); + std::cout << "training result: " << result.percentage << "%\n"; + std::vector files{"../tests/data/test1", "../tests/data/test2"}; model.train(files); - nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); + result = model.evaluateModel("../tests/data/database-binary_test"); std::cout << "training result: " << result.percentage << "%\n"; model.save("test.txt"); diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index c078a9e..fb3283a 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -8,11 +8,11 @@ ] }, "training config": { - "batch size": 32, + "batch size": 16, "batch count": 1000, "optimizer": { "type": "const", - "lr": 0.1 + "lr": 0.5 } }, "network config": [ @@ -23,7 +23,7 @@ "output activation": 4, "layers": [ { "size": 1000, "activationType": 1 }, - { "size": 3000, "activationType": 1 } + { "size": 300, "activationType": 1 } ] } ] From 09174e4f9def6c10e232c1779c770ef8f4cb65a6 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 23:23:22 +0300 Subject: [PATCH 31/40] small changes --- src/model/tensor.cpp | 1 - src/model/tensor_gpu.cu | 121 --------------------------------------- 
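The lines removed from tensor_gpu.cu here are the device-side indexing helpers (computeStridesDevice, flattenIndexGpu, getValueAtIndices, setValueAtIndices), each of which resolved a single index on the GPU at the cost of temporary device allocations and a cudaDeviceSynchronize. Element access now flattens the index on the host from the cached strides and transfers only the one value, roughly as follows (a sketch of the pattern behind Tensor::getValue, assuming the usual stride-based flattenIndex):

    // Host-side flattening: flat = sum over d of indices[d] * strides[d]
    size_t flat = 0;
    for (size_t d = 0; d < indices.size(); ++d)
        flat += indices[d] * strides[d];
    return tensor_gpu::getValueAt(gpu_data, flat);   // one cudaMemcpy of sizeof(ValueType)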
src/model/tensor_gpu.hpp | 19 +----- 3 files changed, 1 insertion(+), 140 deletions(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index fe1cd65..ba95d29 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -2,7 +2,6 @@ #include #include #include -#include namespace nn::global { Tensor::Tensor(const std::vector &shape_, float init) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 7a33f34..da29857 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -26,7 +26,6 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } - void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t size) { cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); } @@ -173,19 +172,6 @@ void multiply_scalar(const ValueType* A, const ValueType B, ValueType* C, std::s multiplyKernel<<>>(A, B, C, count); } -__global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim) { - size_t stride = 1; - for (int i = ndim - 1; i >= 0; --i) { - strides[i] = stride; - stride *= shape[i]; - } -} - -void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim) { - computeStrides<<<1, 1>>>(gpu_shape, gpu_strides, ndim); - cudaDeviceSynchronize(); -} - // Kernel to apply ReLU activation: max(0, x) __global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -381,31 +367,6 @@ __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, *outIndex = idx; } -// Host function to launch kernel -size_t flattenIndexGpu(const size_t* h_indices, const size_t* d_shape, - const size_t* d_strides, size_t ndim) { - size_t *d_indices, *d_outIndex; - cudaMalloc(&d_indices, ndim * sizeof(size_t)); - cudaMalloc(&d_outIndex, sizeof(size_t)); - - cudaMemcpy(d_indices, h_indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); - - flattenIndexKernel<<<1, 1>>>(d_indices, d_shape, d_strides, ndim, d_outIndex); - cudaDeviceSynchronize(); - - size_t result; - cudaMemcpy(&result, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); - - cudaFree(d_indices); - cudaFree(d_outIndex); - - if (result == size_t(-1)) { - throw std::out_of_range("Flattened index out of bounds."); - } - - return result; -} - __global__ void computeFlatIndexKernel( const size_t* indices, const size_t* strides, size_t rank, size_t* outIndex @@ -417,86 +378,6 @@ __global__ void computeFlatIndexKernel( *outIndex = flatIndex; } -ValueType getValueAtIndices( - const ValueType* deviceData, - const size_t* hostIndices, - const size_t* deviceStrides, - size_t size -) { - // Copy host indices to device - size_t* deviceIndices; - cudaMalloc(&deviceIndices, sizeof(size_t) * size); - cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * size, cudaMemcpyHostToDevice); - - // Allocate output for index - size_t* deviceFlatIndex; - cudaMalloc(&deviceFlatIndex, sizeof(size_t)); - - // Launch kernel to compute flat index - computeFlatIndexKernel<<<1, 1>>>( - deviceIndices, deviceStrides, size, deviceFlatIndex - ); - cudaDeviceSynchronize(); - - // Copy back flat index - size_t flatIndex; - cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); - - // Get value at that index - ValueType value; - cudaMemcpy(&value, deviceData + flatIndex, sizeof(ValueType), cudaMemcpyDeviceToHost); - - // Cleanup - cudaFree(deviceIndices); - cudaFree(deviceFlatIndex); - 
- return value; -} - -__global__ void setValueAtIndexKernel(ValueType* data, size_t flatIndex, ValueType value) { - data[flatIndex] = value; -} - -void setValueAtIndices( - ValueType* deviceData, - const size_t* hostIndices, - const size_t* deviceStrides, - size_t ndim, - ValueType value -) { - // Step 1: Allocate and copy indices to GPU - size_t* deviceIndices; - cudaMalloc(&deviceIndices, ndim * sizeof(size_t)); - cudaMemcpy(deviceIndices, hostIndices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); - - // Step 2: Allocate memory to store computed flat index - size_t* deviceFlatIndex; - cudaMalloc(&deviceFlatIndex, sizeof(size_t)); - - // Step 3: Launch kernel to compute flat index - computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceStrides, ndim, deviceFlatIndex); - cudaDeviceSynchronize(); - - // Step 4: Copy flat index to host - size_t flatIndex; - cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); - - // Step 5: Validate flat index - if (flatIndex == size_t(-1)) { - cudaFree(deviceIndices); - cudaFree(deviceFlatIndex); - throw std::out_of_range("Invalid indices in setValueAtIndices"); - } - - // Step 6: Launch kernel to set value at computed flat index - setValueAtIndexKernel<<<1, 1>>>(deviceData, flatIndex, value); - cudaDeviceSynchronize(); - - // Cleanup - cudaFree(deviceIndices); - cudaFree(deviceFlatIndex); -} - __global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { size_t row = blockIdx.x * blockDim.x + threadIdx.x; if (row < M) { @@ -529,8 +410,6 @@ __global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType } } -// Wrapper functions to launch kernels - void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { const int blockSize = 256; int gridSize = (M + blockSize - 1) / blockSize; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 153ce14..ee65e49 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -23,6 +23,7 @@ void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); /// Copy data from GPU to CPU. void copyToHost(void *hostDst, const void *deviceSrc, std::size_t count); +/// Copy data from GPU to GPU. void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count); /// Set all elements to zero (on GPU). 
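For reference while reading this header, the three linear-algebra entry points declared just below have the following shapes and semantics, matching the kernels in tensor_gpu.cu (summarized as C-style pseudocode):

    // matmul : R[i]       = sum_k A[i*K + k] * B[k]   // A is M x K row-major, B length K, R length M
    // matmulT: R[j]       = sum_i W[i*N + j] * V[i]   // R = W^T * V; W is M x N, V length M, R length N
    // outer  : R[i*n + j] = a[i] * b[j]               // a length m, b length n, R is m x n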
@@ -52,8 +53,6 @@ void division_scalar(const ValueType *A, const ValueType B, ValueType *C, std::s /// Element-wise multiply: C = A * B void multiply_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); -void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); - // ---------------- ReLU ---------------- void relu(const ValueType *input, ValueType *output, std::size_t count); void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); @@ -74,24 +73,8 @@ void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_ void softmax(const ValueType *net, ValueType *out, std::size_t size); ValueType getValueAt(const ValueType *devicePtr, std::size_t index); - void setValueAt(ValueType *devicePtr, std::size_t index, ValueType value); -size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); - -ValueType getValueAtIndices( - const ValueType *deviceData, - const size_t *hostIndices, - const size_t *deviceStrides, - size_t size); - -void setValueAtIndices( - ValueType *deviceData, - const size_t *hostIndices, - const size_t *deviceStrides, - size_t ndim, - ValueType value); - void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N); From a3825f8045c5c82dbb280272a456674cd3e4c72f Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 23:37:49 +0300 Subject: [PATCH 32/40] small formating changes --- src/model/tensor_gpu.cu | 301 ++++++++++++++++----------------------- src/model/tensor_gpu.hpp | 130 +++++++++-------- 2 files changed, 186 insertions(+), 245 deletions(-) diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index da29857..1a00003 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -4,206 +4,179 @@ #include namespace nn::global::tensor_gpu { -// Allocate memory on GPU for a tensor. + +// ================================================== +// Memory Management +// ================================================== void* allocate(std::size_t size) { void* devicePtr = nullptr; - cudaError_t err1 = cudaMalloc(&devicePtr, size); - if (err1 != cudaSuccess) { + if (cudaMalloc(&devicePtr, size) != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } return devicePtr; } -// Free GPU memory. void deallocate(void* devicePtr) { if (devicePtr) { cudaFree(devicePtr); } } -// Copy data from CPU to GPU. -void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t size) { cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } -void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t size) { +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t size) { cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); } -// Copy data from GPU to CPU. void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost); } -// Kernel to set all elements to zero. 
+void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { + cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); +} + +ValueType getValueAt(const ValueType* devicePtr, std::size_t index) { + ValueType value; + cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); + return value; +} + +// ================================================== +// Utility Kernels +// ================================================== __global__ void zeroKernel(ValueType* data, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - data[idx] = 0.0f; - } + if (idx < count) data[idx] = 0.0f; } -// Set all elements to zero (on GPU). void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); } -// Kernel for element-wise addition: C = A + B -__global__ void addKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +// ================================================== +// Vector-Vector Operations +// ================================================== +__global__ void addVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] + B[idx]; - } + if (idx < count) C[idx] = A[idx] + B[idx]; } -// Element-wise addition: C = A + B -void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - addKernel<<>>(A, B, C, count); +__global__ void subVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] - B[idx]; } +__global__ void mulVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] * B[idx]; +} -// Kernel for element-wise addition: C = A - B -__global__ void subtractionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +__global__ void divVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] - B[idx]; - } + if (idx < count) C[idx] = A[idx] / B[idx]; } -// Element-wise addition: C = A + B -void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - subtractionKernel<<>>(A, B, C, count); + addVecKernel<<>>(A, B, C, count); } -// Kernel for element-wise addition: C = A / B -__global__ void divisionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] / B[idx]; - } -} - -// Element-wise addition: C = A / B -void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - 
divisionKernel<<>>(A, B, C, count); + subVecKernel<<>>(A, B, C, count); } -// Kernel for element-wise multiplication: C = A * B -__global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] * B[idx]; - } +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + mulVecKernel<<>>(A, B, C, count); } -// Element-wise multiply: C = A * B -void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - multiplyKernel<<>>(A, B, C, count); + divVecKernel<<>>(A, B, C, count); } -// Kernel for element-wise addition: C = A + B -__global__ void addKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +// ================================================== +// Vector-Scalar Operations +// ================================================== +__global__ void addScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] + B; - } + if (idx < count) C[idx] = A[idx] + B; } -// Element-wise addition: C = A + B -void add_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - addKernel<<>>(A, B, C, count); +__global__ void subScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] - B; } +__global__ void mulScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] * B; +} -// Kernel for element-wise addition: C = A - B -__global__ void subtractionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +__global__ void divScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] - B; - } + if (idx < count) C[idx] = A[idx] / B; } -// Element-wise addition: C = A + B -void subtraction_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - subtractionKernel<<>>(A, B, C, count); -} - -// Kernel for element-wise addition: C = A / B -__global__ void divisionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] / B; - } + addScalarKernel<<>>(A, B, C, count); } -// Element-wise addition: C = A / B -void division_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - 
divisionKernel<<>>(A, B, C, count); + subScalarKernel<<>>(A, B, C, count); } -// Kernel for element-wise multiplication: C = A * B -__global__ void multiplyKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] * B; - } +void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + mulScalarKernel<<>>(A, B, C, count); } -// Element-wise multiply: C = A * B -void multiply_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - multiplyKernel<<>>(A, B, C, count); + divScalarKernel<<>>(A, B, C, count); } -// Kernel to apply ReLU activation: max(0, x) -__global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { +// ================================================== +// Activation Functions +// ================================================== +__global__ void reluKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = input[idx] > 0.0 ? input[idx] : 0.0f; - } + if (idx < count) output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f; +} + +__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) output[idx] = input[idx] > 0.0f ? 1.0f : 0.0f; } -// Apply activation function (e.g., ReLU) -void relu(const ValueType *input, ValueType *output, std::size_t count) { +void relu(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluKernel<<>>(input, output, count); } -// Kernel to apply ReLU derivative: -// output[i] = input[i] > 0 ? 1 : 0 -__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = (input[idx] > 0.0f) ? 
1.0f : 0.0f; - } -} - -// Apply derivative of activation function (e.g., ReLU') void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); } -// Kernel to apply Sigmoid activation: 1 / (1 + exp(-x)) __global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { @@ -212,14 +185,6 @@ __global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::si } } -// Apply Sigmoid activation -void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - sigmoidKernel<<>>(input, output, count); -} - -// Kernel for Sigmoid derivative: s(x) * (1 - s(x)) __global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { @@ -229,29 +194,23 @@ __global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* outpu } } -// Apply Sigmoid derivative +void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + sigmoidKernel<<>>(input, output, count); +} + void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidDerivativeKernel<<>>(input, output, count); } -// Kernel to apply Tanh activation: tanh(x) __global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = tanhf(input[idx]); - } + if (idx < count) output[idx] = tanhf(input[idx]); } -// Apply Tanh activation -void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - tanhKernel<<>>(input, output, count); -} - -// Kernel for Tanh derivative: 1 - tanh(x)^2 __global__ void tanhDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { @@ -260,78 +219,69 @@ __global__ void tanhDerivativeKernel(const ValueType* input, ValueType* output, } } -// Apply Tanh derivative +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + tanhKernel<<>>(input, output, count); +} + void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhDerivativeKernel<<>>(input, output, count); } -// Kernel for Leaky ReLU: x > 0 ? x : alpha * x __global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - ValueType x = input[idx]; - output[idx] = (x > 0.0f) ? x : alpha * x; - } + if (idx < count) output[idx] = (input[idx] > 0.0f) ? 
input[idx] : alpha * input[idx]; +} + +__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) output[idx] = (input[idx] > 0.0f) ? 1.0f : alpha; } -// Apply Leaky ReLU void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluKernel<<>>(input, output, count, alpha); } -// Kernel for Leaky ReLU derivative: x > 0 ? 1 : alpha -__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = (input[idx] > 0.0f) ? 1.0f : alpha; - } -} - -// Apply Leaky ReLU derivative void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluDerivativeKernel<<>>(input, output, count, alpha); } +// ================================================== +// Softmax +// ================================================== __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { extern __shared__ ValueType shared[]; std::size_t tid = threadIdx.x; std::size_t idx = blockIdx.x * blockDim.x + tid; - if (idx >= count) return; - // Load input into shared memory - if (idx < count) shared[tid] = input[idx]; - else shared[tid] = -INFINITY; // or 0 - + shared[tid] = (idx < count) ? input[idx] : -INFINITY; __syncthreads(); - // Step 1: Find max value for numerical stability ValueType max_val = shared[0]; - for (std::size_t i = 1; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + for (std::size_t i = 1; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { max_val = fmaxf(max_val, shared[i]); } __syncthreads(); - // Step 2: Compute exp(x - max) ValueType e = expf(shared[tid] - max_val); shared[tid] = e; __syncthreads(); - // Step 3: Sum of exponentials ValueType sum = 0.0f; - for (std::size_t i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + for (std::size_t i = 0; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { sum += shared[i]; } __syncthreads(); - // Step 4: Normalize output[idx] = shared[tid] / sum; } @@ -339,21 +289,12 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; std::size_t sharedMemSize = blockSize * sizeof(ValueType); - softmaxKernel<<>>(input, output, count); } -void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { - cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); -} - -ValueType getValueAt(const ValueType* devicePtr , std::size_t index) { - ValueType value; - cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); - return value; -} - -// Kernel to compute flattened index +// ================================================== +// Index Utilities +// ================================================== __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, const size_t* strides, size_t ndim, size_t* outIndex) { size_t idx = 0; @@ -367,10 +308,8 @@ __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, 
*outIndex = idx; } -__global__ void computeFlatIndexKernel( - const size_t* indices, const size_t* strides, - size_t rank, size_t* outIndex -) { +__global__ void computeFlatIndexKernel(const size_t* indices, const size_t* strides, + size_t rank, size_t* outIndex) { size_t flatIndex = 0; for (size_t i = 0; i < rank; ++i) { flatIndex += indices[i] * strides[i]; @@ -378,7 +317,10 @@ __global__ void computeFlatIndexKernel( *outIndex = flatIndex; } -__global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { +// ================================================== +// Matrix Operations +// ================================================== +__global__ void matmulKernel(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) { size_t row = blockIdx.x * blockDim.x + threadIdx.x; if (row < M) { ValueType sum = 0; @@ -389,17 +331,17 @@ __global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType * } } -__global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { +__global__ void outerKernel(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t total = m * n; if (idx < total) { size_t i = idx / n; size_t j = idx % n; - result[i * n + j] = a[i] * b[j]; // Use '=' since result is zeroed before + result[i * n + j] = a[i] * b[j]; } } -__global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { +__global__ void matmulTKernel(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) { size_t col = blockIdx.x * blockDim.x + threadIdx.x; if (col < N) { ValueType sum = 0.0f; @@ -410,24 +352,25 @@ __global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType } } -void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { +void matmul(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) { const int blockSize = 256; int gridSize = (M + blockSize - 1) / blockSize; matmulKernel<<>>(A, B, R, M, K); cudaDeviceSynchronize(); } -void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { +void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) { const int blockSize = 256; int gridSize = (m * n + blockSize - 1) / blockSize; outerKernel<<>>(a, b, result, m, n); cudaDeviceSynchronize(); } -void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { +void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) { const int blockSize = 256; int gridSize = (N + blockSize - 1) / blockSize; matmulTKernel<<>>(W, V, R, M, N); cudaDeviceSynchronize(); } -} // namespace tensor_gpu + +} // namespace nn::global::tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index ee65e49..ad2ba02 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -11,73 +11,71 @@ class Tensor; // Forward declaration namespace nn::global::tensor_gpu { -/// Allocate memory on GPU for a tensor. 
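// For reference, the semantics of the three matrix routines above (grounded in
// the kernels and in the CPU path of Tensor::matmul): despite the name, matmul
// is a matrix-vector product, matmulT applies the transposed matrix, and outer
// builds a rank-1 matrix. A CPU equivalent of matmul, for a row-major M x K
// matrix A and a K-vector B (illustrative, the name is a placeholder):
void matmul_cpu_reference(const ValueType *A, const ValueType *B, ValueType *R,
                          std::size_t M, std::size_t K) {
    for (std::size_t i = 0; i < M; ++i) {
        ValueType sum = 0;
        for (std::size_t k = 0; k < K; ++k) {
            sum += A[i * K + k] * B[k];
        }
        R[i] = sum;                       // same per-row result as matmulKernel
    }
}
// Likewise matmulT computes R[j] = sum_i W[i*N + j] * V[i] for an M x N matrix W,
// and outer computes result[i*n + j] = a[i] * b[j].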
-void *allocate(std::size_t size); +// ============================ +// Memory Management +// ============================ +void* allocate(std::size_t size); +void deallocate(void* devicePtr); + +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t count); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count); +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t count); + +void zero(ValueType* deviceData, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Vector) +// ============================ +void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Scalar) +// ============================ +void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); + +// ============================ +// Activation Functions +// ============================ + +// ReLU +void relu(const ValueType* input, ValueType* output, std::size_t count); +void relu_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Leaky ReLU +void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); +void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); + +// Sigmoid +void sigmoid(const ValueType* input, ValueType* output, std::size_t count); +void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Tanh +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count); +void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Softmax +void softmax(const ValueType* net, ValueType* out, std::size_t size); + +// ============================ +// Single Value Access +// ============================ +ValueType getValueAt(const ValueType* devicePtr, std::size_t index); +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value); + +// ============================ +// Matrix Operations +// ============================ +void matmul(const ValueType* A, const ValueType* B, ValueType* R, std::size_t M, std::size_t K); +void matmulT(const ValueType* W, const ValueType* V, ValueType* R, std::size_t M, std::size_t N); +void outer(const ValueType* a, const ValueType* b, ValueType* result, std::size_t m, std::size_t n); -/// Free GPU memory. -void deallocate(void *devicePtr); - -/// Copy data from CPU to GPU. -void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); - -/// Copy data from GPU to CPU. -void copyToHost(void *hostDst, const void *deviceSrc, std::size_t count); - -/// Copy data from GPU to GPU. -void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count); - -/// Set all elements to zero (on GPU). 
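// Note on the launch configuration used by the wrappers declared above: the
// block count is plain ceiling division. A small helper makes the arithmetic
// explicit (illustrative only, not part of this header):
constexpr std::size_t numBlocksFor(std::size_t count, std::size_t blockSize = 256) {
    // e.g. count = 1000, blockSize = 256  ->  (1000 + 255) / 256 = 4 blocks,
    // i.e. 1024 threads; the `if (idx < count)` guard in each kernel masks the
    // 24 surplus threads.
    return (count + blockSize - 1) / blockSize;
}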
-void zero(ValueType *deviceData, std::size_t count); - -/// Element-wise addition: C = A + B -void add_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A - B -void subtraction_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A / B -void division_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise multiply: C = A * B -void multiply_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A + B -void add_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A - B -void subtraction_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A / B -void division_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -/// Element-wise multiply: C = A * B -void multiply_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -// ---------------- ReLU ---------------- -void relu(const ValueType *input, ValueType *output, std::size_t count); -void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); - -// ---------------- Sigmoid ---------------- -void sigmoid(const ValueType *input, ValueType *output, std::size_t count); -void sigmoid_derivative(const ValueType *input, ValueType *output, std::size_t count); - -// ---------------- Tanh ---------------- -void tanh_activation(const ValueType *input, ValueType *output, std::size_t count); -void tanh_derivative(const ValueType *input, ValueType *output, std::size_t count); - -// ---------------- Leaky ReLU ---------------- -void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); -void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); - -// ---------------- Softmax ---------------- -void softmax(const ValueType *net, ValueType *out, std::size_t size); - -ValueType getValueAt(const ValueType *devicePtr, std::size_t index); -void setValueAt(ValueType *devicePtr, std::size_t index, ValueType value); - -void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); -void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); -void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N); } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU + From 586dc3f61222bb67ee35d3df2569d814bfed9ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maayan=20Portugues=20=F0=9F=8E=97=EF=B8=8F?= Date: Fri, 8 Aug 2025 23:54:08 +0300 Subject: [PATCH 33/40] Update cmake-multi-platform.yml --- .github/workflows/cmake-multi-platform.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 560d323..5923ace 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -28,6 +28,13 @@ jobs: libxi-dev \ libfreetype6-dev + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin + sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo 
dpkg -i cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo cp /var/cuda-repo-ubuntu2404-13-0-local/cuda-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get -y install cuda-toolkit-13-0 + - name: Checkout code uses: actions/checkout@v4 From 4ddf94dd952bedfd19e4f672dff0ddbdf6e42ddf Mon Sep 17 00:00:00 2001 From: maayan Date: Sat, 9 Aug 2025 13:02:35 +0300 Subject: [PATCH 34/40] improved code --- include/model.hpp | 30 ++++++----- src/model/model.cpp | 20 +++++--- src/model/tensor_gpu.cu | 81 ++++++++++++++++++++++++------ tests/binary_test.cpp | 9 +++- tests/data/config-binary_test.json | 2 +- 5 files changed, 108 insertions(+), 34 deletions(-) diff --git a/include/model.hpp b/include/model.hpp index b9c9ae0..9e099e8 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -4,6 +4,9 @@ #include "../src/model/dataBase.hpp" #include "../src/model/optimizers.hpp" #include "../src/visualizer/VisualizerController.hpp" +#include "Globals.hpp" +#include "tensor.hpp" +#include #include namespace nn::visualizer { @@ -65,7 +68,7 @@ class Model { global::ValueType runBackPropagation( const Batch &batch, const bool updateWeights, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void printTrainingResult( const std::chrono::high_resolution_clock::time_point &start, @@ -81,12 +84,12 @@ class Model { DataBase &dataBase, const bool cancleOnError = false, const bool showProgressbar = true, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void trainModel( DataBase &trainedDataBase, DataBase &evaluateDataBase, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); size_t outputSize() const; size_t inputSize() const; @@ -103,10 +106,13 @@ class Model { void autoSave(const int i); - void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); - void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); - std::uint32_t calculateSubNetWidth() const; + std::uint32_t calculateSubNetWidth() const; + + void runModel(const global::Tensor &input, + global::Transformation transformation); public: Model(const std::string &config_filepath); @@ -115,16 +121,16 @@ class Model { void runModel(const global::Tensor &input); void train( const std::string &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); void train( const std::vector &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); modelResult evaluateModel( const std::string &db_filename, const bool cancleOnError = false, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void save(const std::string &file); void load(const std::string &file); diff --git a/src/model/model.cpp b/src/model/model.cpp index a5a958f..d453f1c 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,5 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" +#include "Globals.hpp" 
#include "dataBase.hpp" #include "tensor.hpp" #include @@ -182,14 +183,12 @@ global::ValueType Model::runBackPropagation( global::Tensor output({outputSize()}); for (size_t i = 0; i < batch.size(); ++i) { TrainSample *current_sample_ptr = batch.samples.at(i); - output.zero(); - // visual.updatePrediction(current_sample_ptr->pre); + visual.updatePrediction(current_sample_ptr->pre); - // runModel(transformation(current_sample_ptr->input)); - - runModel(current_sample_ptr->input); + runModel(current_sample_ptr->input, transformation); if (doBackward) { + output.zero(); output.setValue({current_sample_ptr->pre.index}, 1); Backward(output); updateWeights(batch.size()); @@ -333,6 +332,15 @@ float Model::calculatePercentage(size_t currentSize, size_t totalSize) { return 100.0f * static_cast(currentSize) / static_cast(totalSize); } +void Model::runModel(const global::Tensor &input, + global::Transformation transformation) { + if (transformation) { + runModel(transformation(input)); + } else { + runModel(input); + } +} + modelResult Model::evaluateModel( DataBase &dataBase, const bool cancleOnError, @@ -352,7 +360,7 @@ modelResult Model::evaluateModel( for (int i = 0; i < result.dbSize; ++i) { TrainSample &sample = dataBase.getSample(i); - runModel(transformation(sample.input)); + runModel(sample.input, transformation); size_t predicted_index = 0; float max_value = getOutput().getValue({0}); diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 1a00003..5a2fc62 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -4,6 +4,13 @@ #include namespace nn::global::tensor_gpu { +#define CUDA_CHECK(call) do { \ + cudaError_t e = (call); \ + if (e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + throw std::runtime_error(cudaGetErrorString(e)); \ + } \ +} while(0) // ================================================== // Memory Management @@ -23,24 +30,24 @@ void deallocate(void* devicePtr) { } void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t size) { - cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice)); } void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t size) { - cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); + CUDA_CHECK(cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice)); } void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { - cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost); + CUDA_CHECK(cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost)); } void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { - cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice)); } ValueType getValueAt(const ValueType* devicePtr, std::size_t index) { ValueType value; - cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); + CUDA_CHECK(cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost)); return value; } @@ -56,6 +63,8 @@ void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -85,24 +94,32 
@@ void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; mulVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -132,24 +149,32 @@ void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; mulScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -169,12 +194,16 @@ void relu(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } __global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) { @@ -198,12 +227,16 @@ void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t 
blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidDerivativeKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } __global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) { @@ -223,12 +256,16 @@ void tanh_activation(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhDerivativeKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } __global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { @@ -245,12 +282,16 @@ void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, Va std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluKernel<<>>(input, output, count, alpha); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluDerivativeKernel<<>>(input, output, count, alpha); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -258,31 +299,35 @@ void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_ // ================================================== __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { extern __shared__ ValueType shared[]; - std::size_t tid = threadIdx.x; - std::size_t idx = blockIdx.x * blockDim.x + tid; - if (idx >= count) return; + std::size_t blockStart = blockIdx.x * blockDim.x; + std::size_t idx = blockStart + tid; + // always write shared for every thread in block shared[tid] = (idx < count) ? input[idx] : -INFINITY; __syncthreads(); + // compute max (naive per-thread loop) ValueType max_val = shared[0]; - for (std::size_t i = 1; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { - max_val = fmaxf(max_val, shared[i]); + for (unsigned int i = 1; i < blockDim.x; ++i) { + std::size_t curIdx = blockStart + i; + if (curIdx < count) max_val = fmaxf(max_val, shared[i]); } __syncthreads(); - ValueType e = expf(shared[tid] - max_val); + ValueType e = (idx < count) ? expf(shared[tid] - max_val) : 0.0f; shared[tid] = e; __syncthreads(); + // compute sum (naive) ValueType sum = 0.0f; - for (std::size_t i = 0; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { - sum += shared[i]; + for (unsigned int i = 0; i < blockDim.x; ++i) { + std::size_t curIdx = blockStart + i; + if (curIdx < count) sum += shared[i]; } __syncthreads(); - output[idx] = shared[tid] / sum; + if (idx < count) output[idx] = shared[tid] / (sum == 0.0f ? 
1.0f : sum); } void softmax(const ValueType* input, ValueType* output, std::size_t count) { @@ -290,6 +335,8 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { std::size_t numBlocks = (count + blockSize - 1) / blockSize; std::size_t sharedMemSize = blockSize * sizeof(ValueType); softmaxKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -357,6 +404,8 @@ void matmul(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size int gridSize = (M + blockSize - 1) / blockSize; matmulKernel<<>>(A, B, R, M, K); cudaDeviceSynchronize(); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) { @@ -364,6 +413,8 @@ void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, int gridSize = (m * n + blockSize - 1) / blockSize; outerKernel<<>>(a, b, result, m, n); cudaDeviceSynchronize(); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) { @@ -371,6 +422,8 @@ void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, siz int gridSize = (N + blockSize - 1) / blockSize; matmulTKernel<<>>(W, V, R, M, N); cudaDeviceSynchronize(); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } } // namespace nn::global::tensor_gpu diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 17b318a..34ac415 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -79,7 +79,15 @@ int main(int argc, char *argv[]) { std::string config_FN = tests::appendToBase("config-binary_test.json"); // nn::global::Tensor give_me_a_name({5, 3}); // printf("test: \n"); + // give_me_a_name.fill(5); // give_me_a_name.setValue({2, 1}, 5); + // + // nn::global::Tensor give_me_a_name1({5, 3}); + // printf("test: \n"); + // give_me_a_name1.fill(3); + // give_me_a_name1.setValue({2, 1}, 10); + // give_me_a_name1 += give_me_a_name; + // printf("test: %f\n", give_me_a_name1.getValue({2,1})); // return 0; nn::model::Model model(config_FN); @@ -87,7 +95,6 @@ int main(int argc, char *argv[]) { if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - model.load("test.txt"); nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); std::cout << "training result: " << result.percentage << "%\n"; std::vector files{"../tests/data/test1", "../tests/data/test2"}; diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index fb3283a..e95589c 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -8,7 +8,7 @@ ] }, "training config": { - "batch size": 16, + "batch size": 64, "batch count": 1000, "optimizer": { "type": "const", From 369b2ab93a846cc0154ef0b5026636bb9a71a31c Mon Sep 17 00:00:00 2001 From: maayan Date: Sat, 9 Aug 2025 16:37:24 +0300 Subject: [PATCH 35/40] bug fixes --- include/model.hpp | 4 ++-- src/model/activations.cpp | 17 +++++++------- src/model/model.cpp | 36 +++++++++++++++++++++++------- src/model/tensor.cpp | 11 ++++----- src/networks/fnn/DenseLayer.cpp | 8 +++---- src/networks/fnn/FnnVisualizer.hpp | 8 +++---- tests/binary_test.cpp | 12 ---------- tests/data/config-binary_test.json | 7 ++++-- 8 files changed, 58 insertions(+), 45 deletions(-) diff --git a/include/model.hpp 
b/include/model.hpp index 9e099e8..26aed62 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -132,8 +132,8 @@ class Model { const bool cancleOnError = false, global::Transformation transformation = nullptr); - void save(const std::string &file); - void load(const std::string &file); + void save(const std::string &file, bool print = true); + void load(const std::string &file, bool print = true); global::Prediction getPrediction() const; }; diff --git a/src/model/activations.cpp b/src/model/activations.cpp index f7a70b1..0a22ac3 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -46,16 +46,17 @@ void Activation::derivativeActivate(const global::Tensor &net, global::Tensor &o } global::ValueType Activation::maxVector(const global::Tensor &metrix) { - if (metrix.isGpu) { - } - global::ValueType max = metrix.cpu_data[0]; - for (size_t i = 0; i < metrix.numElements(); ++i) { - if (metrix.getValue({i}) > max) { - max = metrix.getValue({i}); + if (!metrix.isGpu) { + global::ValueType max = metrix.cpu_data[0]; + for (size_t i = 0; i < metrix.numElements(); ++i) { + if (metrix.getValue({i}) > max) { + max = metrix.getValue({i}); + } } - } - return max; + return max; + } + return 0; } global::ValueType Activation::relu(const global::ValueType z) { diff --git a/src/model/model.cpp b/src/model/model.cpp index d453f1c..3fa4f24 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,8 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" -#include "Globals.hpp" #include "dataBase.hpp" -#include "tensor.hpp" #include #include #include @@ -92,6 +90,7 @@ std::uint32_t Model::calculateSubNetWidth() const { void Model::initModel() { const std::uint32_t WIDTH = calculateSubNetWidth(); + size_t param_amount = 0; for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { ISubNetworkConfig &_config = *config.networkConfig.SubNetworksConfig[i]; @@ -101,7 +100,14 @@ void Model::initModel() { } else if (_config.NNLable() == cnn::CNN_LABLE) { addCNN(WIDTH, _config); } + + param_amount += network[i]->getParams().numElements(); } + + std::cout << "initialize model - " + << param_amount << " parameters, " + << config.networkConfig.SubNetworksConfig.size() << " sub networks" + << std::endl; } void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { @@ -284,7 +290,7 @@ bool Model::autoEvaluating( void Model::autoSave(const int i) { if (config.trainingConfig.isAutoSave() && i % config.trainingConfig.getAutoSave().saveEvery == 0) { - save(config.trainingConfig.getAutoSave().dataFilenameAutoSave); + save(config.trainingConfig.getAutoSave().dataFilenameAutoSave, false); } } @@ -415,9 +421,13 @@ size_t Model::inputSize() const { return network[0]->inputSize(); } -void Model::save(const std::string &file) { +void Model::save(const std::string &file, bool print) { std::ofstream outFile(file); + if (print) { + std::cout << "Start saving" << std::endl; + } + for (size_t i = 0; i < network.size(); ++i) { global::Tensor params = network[i]->getParams(); @@ -425,18 +435,26 @@ void Model::save(const std::string &file) { for (size_t j = 0; j < params.numElements(); ++j) { outFile << params.getValue({j}) << " "; } - outFile << std::endl; } + if (print) { + std::cout << " saving complete" << std::endl; + } + outFile.close(); } -void Model::load(const std::string &file) { +void Model::load(const std::string &file, bool print) { std::ifstream inFile(file); std::string line; int networkI = 0; + + if (print) { + std::cout << 
"Start loading" << std::endl; + } + while (std::getline(inFile, line)) { std::istringstream iss(line); @@ -445,17 +463,19 @@ void Model::load(const std::string &file) { global::Tensor numbers({ParamSize}); float num; - for (size_t i = 0; i < ParamSize; ++i) { iss >> num; numbers.setValue({i}, num); } network[networkI]->setParams(numbers); - networkI++; } + if (print) { + std::cout << " loading complete" << std::endl; + } + inFile.close(); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ba95d29..88c3f4e 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -33,6 +33,7 @@ Tensor::Tensor(const Tensor &other) { if (isGpu) { gpu_data_size = other.gpu_data_size; gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); } else { cpu_data = other.cpu_data; @@ -80,18 +81,18 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { - ValueType *temp = gpu_data; if (gpu_data_size != other.gpu_data_size) { + ValueType *temp = gpu_data; temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; - } - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(temp, other.gpu_data, gpu_data_size * sizeof(ValueType)); - if (gpu_data_size != other.gpu_data_size) { tensor_gpu::deallocate(gpu_data); gpu_data = temp; - } + } else { + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + } } shape = other.shape; diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index d955dc3..b467dc2 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -168,12 +168,12 @@ void DenseLayer::fillParamRandom() { } void DenseLayer::resetDots() { - net.fill(0); - out.fill(0); + net.zero(); + out.zero(); } void DenseLayer::resetGradient() { - gradients.biases.fill(0); - gradients.weights.fill(0); + gradients.biases.zero(); + gradients.weights.zero(); } } // namespace nn::model::fnn diff --git a/src/networks/fnn/FnnVisualizer.hpp b/src/networks/fnn/FnnVisualizer.hpp index 067050c..a497484 100644 --- a/src/networks/fnn/FnnVisualizer.hpp +++ b/src/networks/fnn/FnnVisualizer.hpp @@ -32,11 +32,11 @@ static const std::array color_lookup = { class VisualDenseLayer { private: - global::Tensor net{{0}}; - global::Tensor out{{0}}; + global::Tensor net{{1}}; + global::Tensor out{{1}}; - model::fnn::LayerParams parameters{0, 0}; - model::fnn::LayerParams gradients{0, 0}; + model::fnn::LayerParams parameters{1, 1}; + model::fnn::LayerParams gradients{1, 1}; sf::Vector2f pos; diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 34ac415..4fdba8f 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -77,18 +77,6 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = tests::appendToBase("config-binary_test.json"); - // nn::global::Tensor give_me_a_name({5, 3}); - // printf("test: \n"); - // give_me_a_name.fill(5); - // give_me_a_name.setValue({2, 1}, 5); - // - // nn::global::Tensor give_me_a_name1({5, 3}); - // printf("test: \n"); - // give_me_a_name1.fill(3); - // give_me_a_name1.setValue({2, 1}, 10); - // give_me_a_name1 += give_me_a_name; - // printf("test: %f\n", give_me_a_name1.getValue({2,1})); - // return 0; nn::model::Model model(config_FN); diff --git 
a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index e95589c..aef3f34 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": false, + "enableVisuals": true, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, @@ -22,7 +22,10 @@ "output size": 16, "output activation": 4, "layers": [ - { "size": 1000, "activationType": 1 }, + { "size": 10, "activationType": 1 }, + { "size": 10, "activationType": 1 }, + { "size": 10, "activationType": 1 }, + { "size": 10, "activationType": 1 }, { "size": 300, "activationType": 1 } ] } From c271d62ea41020c5dfa7429678d2cf0e1e6b4a70 Mon Sep 17 00:00:00 2001 From: maayan Date: Sat, 9 Aug 2025 18:25:59 +0300 Subject: [PATCH 36/40] now it is possible to show only status on visual --- include/tensor.hpp | 7 +- src/model/config.hpp | 3 +- src/model/model.cpp | 7 +- src/model/tensor.cpp | 126 ++++++++++++-------------- src/visualizer/VInterface.cpp | 35 +++++-- src/visualizer/VInterface.hpp | 6 +- src/visualizer/VisualizerRenderer.cpp | 43 ++++++--- src/visualizer/VisualizerRenderer.hpp | 5 +- tests/data/config-binary_test.json | 9 +- 9 files changed, 137 insertions(+), 104 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 1064f1d..bb0749d 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -30,7 +30,7 @@ class Tensor { public: // Constructors - Tensor(const std::vector &shape, float init = 0.0f); + Tensor(const std::vector &shape, ValueType init = 0.0f); Tensor(const Tensor &other); ~Tensor(); @@ -40,9 +40,8 @@ class Tensor { ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); - void insertRange(const Tensor &other, - const size_t startO, const size_t startT, - const size_t length); + void insertRange(const Tensor &other, const size_t startO, + const size_t startT, const size_t length); // Shape and size size_t numElements() const; diff --git a/src/model/config.hpp b/src/model/config.hpp index 2c4b4ef..5aa54b7 100644 --- a/src/model/config.hpp +++ b/src/model/config.hpp @@ -148,9 +148,10 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualMode, state, mode); struct VisualConfig { bool enableVisuals{true}; + bool enableNetwrokVisual{true}; std::vector modes; }; -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, modes); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, enableNetwrokVisual, modes); class Config { public: diff --git a/src/model/model.cpp b/src/model/model.cpp index 3fa4f24..694dd8c 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -78,6 +78,9 @@ void Model::initOptimizer() { void Model::initVisual() { visual.start(); + if (!config.visualConfig.enableNetwrokVisual) + return; + for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { visual.addVisualSubNetwork(network[i]->getVisual()); network[i]->getVisual()->setVstate(visual.Vstate); @@ -113,7 +116,7 @@ void Model::initModel() { void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { fnn::FNNConfig &sub_ = (fnn::FNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, @@ -129,7 +132,7 @@ void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { void Model::addCNN(const std::uint32_t width, ISubNetworkConfig 
&_config) { cnn::CNNConfig &sub_ = (cnn::CNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 88c3f4e..8e40e5b 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -4,7 +4,7 @@ #include namespace nn::global { -Tensor::Tensor(const std::vector &shape_, float init) { +Tensor::Tensor(const std::vector &shape_, ValueType init) { if (shape_.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } @@ -16,12 +16,12 @@ Tensor::Tensor(const std::vector &shape_, float init) { std::multiplies<>()); shape = shape_; - if (!isGpu) { - cpu_data.assign(totalSize, init); - } else { + if (isGpu) { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_data_size = totalSize; fill(init); + } else { + cpu_data.assign(totalSize, init); } computeStrides(); @@ -33,7 +33,6 @@ Tensor::Tensor(const Tensor &other) { if (isGpu) { gpu_data_size = other.gpu_data_size; gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); } else { cpu_data = other.cpu_data; @@ -48,10 +47,10 @@ size_t Tensor::numElements() const { } void Tensor::getData(std::vector &dest) const { - if (!isGpu) { - dest = cpu_data; - } else { + if (isGpu) { tensor_gpu::copyToHost(dest.data(), gpu_data, gpu_data_size * sizeof(ValueType)); + } else { + dest = cpu_data; } } @@ -78,21 +77,18 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - if (!isGpu) { - cpu_data = other.cpu_data; - } else { + if (isGpu) { if (gpu_data_size != other.gpu_data_size) { - ValueType *temp = gpu_data; - temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); - + ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(temp, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::deallocate(gpu_data); gpu_data = temp; } else { tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - } + } + } else { + cpu_data = other.cpu_data; } shape = other.shape; @@ -102,13 +98,13 @@ Tensor &Tensor::operator=(const Tensor &other) { Tensor &Tensor::operator=(const std::vector &other) { if (other.size() != numElements()) { - throw std::invalid_argument(""); + throw std::length_error("Tensor assignment size mismatch"); } - if (!isGpu) { - cpu_data = other; - } else { + if (isGpu) { tensor_gpu::copyToDevice(gpu_data, other.data(), gpu_data_size * sizeof(ValueType)); + } else { + cpu_data = other; } return *this; @@ -138,11 +134,11 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } ValueType Tensor::getValue(const std::vector &indices) const { - if (!isGpu) { - return cpu_data[flattenIndex(indices)]; + if (isGpu) { + return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } - return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); + return cpu_data[flattenIndex(indices)]; } void Tensor::insertRange(const Tensor &other, @@ -158,22 +154,21 @@ void Tensor::insertRange(const Tensor &other, } void Tensor::setValue(const std::vector &indices, const ValueType value) { - if (!isGpu) { - cpu_data[flattenIndex(indices)] = value; - } else { + if (isGpu) { 
@@ -138,11 +134,11 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const {
 }
 
 ValueType Tensor::getValue(const std::vector &indices) const {
-  if (!isGpu) {
-    return cpu_data[flattenIndex(indices)];
+  if (isGpu) {
+    return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices));
   }
 
-  return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices));
+  return cpu_data[flattenIndex(indices)];
 }
 
 void Tensor::insertRange(const Tensor &other,
@@ -158,22 +154,21 @@
 }
 
 void Tensor::setValue(const std::vector &indices, const ValueType value) {
-  if (!isGpu) {
-    cpu_data[flattenIndex(indices)] = value;
-  } else {
+  if (isGpu) {
     tensor_gpu::setValueAt(gpu_data, flattenIndex(indices), value);
+  } else {
+    cpu_data[flattenIndex(indices)] = value;
   }
 }
 
 Tensor &Tensor::operator+=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator+=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] += other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::add_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] += other.cpu_data[i];
   }
   return *this;
 }
@@ -181,12 +176,11 @@ Tensor &Tensor::operator+=(const Tensor &other) {
 Tensor &Tensor::operator-=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator-=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] -= other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::subtraction_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] -= other.cpu_data[i];
   }
   return *this;
 }
@@ -194,12 +188,11 @@ Tensor &Tensor::operator-=(const Tensor &other) {
 Tensor &Tensor::operator*=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator*=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] *= other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::multiply_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] *= other.cpu_data[i];
   }
   return *this;
 }
@@ -207,52 +200,51 @@ Tensor &Tensor::operator*=(const Tensor &other) {
 Tensor &Tensor::operator/=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator/=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] /= other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::division_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] /= other.cpu_data[i];
   }
   return *this;
 }
 
 Tensor &Tensor::operator*=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x *= scalar;
-  } else {
-    tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
 
 Tensor &Tensor::operator-=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x -= scalar;
-  } else {
-    tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
 
 Tensor &Tensor::operator+=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x += scalar;
-  } else {
-    tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
 
 Tensor &Tensor::operator/=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x /= scalar;
-  } else {
-    tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
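
The element-wise hunks above route the GPU path through tensor_gpu::add_vec, subtraction_vec,
multiply_vec, division_vec and the *_scalar variants, each taking (lhs, rhs, destination, element
count) with the destination allowed to alias an input. Those kernels are not shown in this patch;
as a rough sketch of what one of them might look like (launch geometry, synchronization and error
handling are illustrative choices, not the project's actual code):

    // Assumed sketch of an element-wise kernel behind tensor_gpu::add_vec; not the
    // repository's implementation. Reading a[i] and b[i] before writing out[i] keeps
    // the in-place call pattern add_vec(gpu_data, other.gpu_data, gpu_data, n) safe.
    #include <cuda_runtime.h>
    #include <cstddef>

    namespace tensor_gpu {

    __global__ void add_vec_kernel(const float *a, const float *b, float *out,
                                   std::size_t n) {
      std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        out[i] = a[i] + b[i];
    }

    void add_vec(const float *a, const float *b, float *out, std::size_t n) {
      constexpr unsigned threadsPerBlock = 256; // illustrative default
      const unsigned blocks =
          static_cast<unsigned>((n + threadsPerBlock - 1) / threadsPerBlock);
      add_vec_kernel<<<blocks, threadsPerBlock>>>(a, b, out, n);
      cudaDeviceSynchronize(); // simplest blocking choice for a sketch
    }

    } // namespace tensor_gpu
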
@@ -270,7 +262,9 @@ void Tensor::matmul(const Tensor &other, Tensor &result) const {
 
   result.zero();
 
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K);
+  } else {
     const float *A = cpu_data.data();
     const float *B = other.cpu_data.data();
     float *R = result.cpu_data.data();
@@ -283,8 +277,6 @@ void Tensor::matmul(const Tensor &other, Tensor &result) const {
       }
       R[i] = sum;
     }
-  } else {
-    tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K);
   }
 }
 
@@ -298,7 +290,9 @@ void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) {
 
   result.zero();
 
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n);
+  } else {
     float *r = result.cpu_data.data();
     const float *A = a.cpu_data.data();
     const float *B = b.cpu_data.data();
@@ -308,8 +302,6 @@ void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) {
         r[i * n + j] += A[i] * B[j];
       }
     }
-  } else {
-    tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n);
   }
 }
 
@@ -321,14 +313,14 @@ void Tensor::matmulT(const Tensor &vec, Tensor &result) const {
 
   result.zero();
 
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]);
+  } else {
     for (size_t i = 0; i < shape[1]; ++i) {
       for (size_t j = 0; j < shape[0]; ++j) {
        result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j];
      }
    }
-  } else {
-    tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]);
   }
 }
diff --git a/src/visualizer/VInterface.cpp b/src/visualizer/VInterface.cpp
index 5fa4ebd..8a81499 100644
--- a/src/visualizer/VInterface.cpp
+++ b/src/visualizer/VInterface.cpp
@@ -1,30 +1,45 @@
 #include "VInterface.hpp"
+#include "state.hpp"
 
 namespace nn::visualizer {
 
-IntefacePanel::IntefacePanel(const std::shared_ptr vstate)
+InterfacePanel::InterfacePanel(const std::shared_ptr vstate)
     : Panel(vstate), VRender({VINTERFACE_WIDTH, VINTERFACE_HEIGHT}) {
   createVInterface();
 }
 
-void IntefacePanel::createVInterface() {
+void InterfacePanel::createVInterface() {
   VRender.clear(INTERFACE_PANEL_COLOR);
   buttons.reserve(STATES_COUNT);
 
+  constexpr std::array skipWhenDisabled = {
+      SettingType::AutoPause,
+      SettingType::Pause,
+      SettingType::PreciseMode};
+
   for (int i = 0; i < STATES_COUNT; ++i) {
-    buttons.push_back(std::make_unique