From 84a7b8be1d5db3c88243441213c2e83adf4390a8 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 16:25:49 +0300 Subject: [PATCH 01/40] first commit --- include/tensor.hpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/tensor.hpp b/include/tensor.hpp index 71b24d6..351cdef 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -7,12 +7,19 @@ namespace nn::global { using ValueType = float; +enum class Backend { + CPU, + GPU, +}; + class Tensor { private: std::vector data; std::vector shape; std::vector strides; + Backend BackendType; + void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; From 313c342f1156f262189e86808197f637e930d58a Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 16:43:49 +0300 Subject: [PATCH 02/40] new commit --- CMakeLists.txt | 21 +++++++++++++++++---- src/model/tensor.cpp | 1 + src/model/tensor_gpu.cu | 5 +++++ src/model/tensor_gpu.hpp | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 4 deletions(-) create mode 100644 src/model/tensor_gpu.cu create mode 100644 src/model/tensor_gpu.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 86f4cc4..a622b5b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,12 @@ cmake_minimum_required(VERSION 3.28) -project(NeuralNetwork LANGUAGES CXX) +project(NeuralNetwork LANGUAGES CXX CUDA) # Add CUDA here # ------------------------------------------------------------------ # Configuration set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) # Add CUDA standard +set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Enforce it set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") @@ -40,7 +42,7 @@ FetchContent_Declare(nlohmann_json FetchContent_MakeAvailable(SFML nlohmann_json) # ------------------------------------------------------------------ -# Function: Apply sanitizers +# Function: Apply sanitizers (for CPU code only) function(apply_sanitizers target) target_compile_options(${target} PRIVATE -fsanitize=address -fno-omit-frame-pointer -g) target_link_libraries(${target} PRIVATE -fsanitize=address) @@ -48,13 +50,22 @@ endfunction() # ------------------------------------------------------------------ # Main library + +# Add both C++ and CUDA sources file(GLOB_RECURSE NN_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu" # Include CUDA source files ) add_library(NeuralNetwork STATIC ${NN_SOURCES}) set_target_properties(NeuralNetwork PROPERTIES POSITION_INDEPENDENT_CODE ON) +# Enable separable compilation for CUDA files +set_target_properties(NeuralNetwork PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON +) + target_include_directories(NeuralNetwork PUBLIC $ @@ -71,19 +82,20 @@ target_link_libraries(NeuralNetwork SFML::Window SFML::System nlohmann_json::nlohmann_json + cuda + cudart ) target_compile_options(NeuralNetwork PRIVATE -Wall -Wextra -Wpedantic) # ------------------------------------------------------------------ -# Tests (with sanitizers) +# Tests option(BUILD_NN_TESTS "Build NeuralNetwork tests" OFF) if(BUILD_NN_TESTS) enable_testing() include(CTest) - # Apply sanitizers only for test builds apply_sanitizers(NeuralNetwork) file(GLOB TEST_SOURCES CONFIGURE_DEPENDS tests/*.cpp) @@ -109,3 +121,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) + diff --git a/src/model/tensor.cpp 
b/src/model/tensor.cpp index 80fd8cb..3765a33 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,6 +1,7 @@ #include #include #include +#include "tensor_gpu.hpp" namespace nn::global { Tensor::Tensor(const std::vector &shape, float init) diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu new file mode 100644 index 0000000..8530403 --- /dev/null +++ b/src/model/tensor_gpu.cu @@ -0,0 +1,5 @@ +#include "tensor_gpu.hpp" +#include + +namespace tensor_gpu { +} diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp new file mode 100644 index 0000000..8294ad7 --- /dev/null +++ b/src/model/tensor_gpu.hpp @@ -0,0 +1,39 @@ +#include + +class Tensor; // Forward declaration + +namespace tensor_gpu { + +/// Allocate memory on GPU for a tensor. +float *allocate(std::size_t count); + +/// Free GPU memory. +void deallocate(float *devicePtr); + +/// Copy data from CPU to GPU. +void copyToDevice(float *deviceDst, const float *hostSrc, std::size_t count); + +/// Copy data from GPU to CPU. +void copyToHost(float *hostDst, const float *deviceSrc, std::size_t count); + +/// Set all elements to zero (on GPU). +void zero(float *deviceData, std::size_t count); + +/// Element-wise addition: C = A + B +void add(const float *A, const float *B, float *C, std::size_t count); + +/// Element-wise multiply: C = A * B +void multiply(const float *A, const float *B, float *C, std::size_t count); + +/// Dot product between two vectors (A · B) +float dot(const float *A, const float *B, std::size_t count); + +/// Apply activation function (e.g., ReLU) +void relu(float *deviceData, std::size_t count); + +/// Apply derivative of activation function (e.g., ReLU') +void relu_derivative(const float *input, float *output, std::size_t count); + +// Add more operations as needed... 
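+//
+// Rough usage sketch of this interface (illustrative only; `hostA`, `hostB`
+// and `hostC` stand for std::vector<float> buffers and are not part of the
+// patch):
+//
+//   float *dA = allocate(n);
+//   float *dB = allocate(n);
+//   float *dC = allocate(n);
+//   copyToDevice(dA, hostA.data(), n);
+//   copyToDevice(dB, hostB.data(), n);
+//   add(dA, dB, dC, n);               // element-wise C = A + B on the device
+//   copyToHost(hostC.data(), dC, n);
+//   deallocate(dA); deallocate(dB); deallocate(dC);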
+ +} // namespace tensor_gpu From 2294ccd197002e4b0495710e9b1ce44a252823ae Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 17:21:28 +0300 Subject: [PATCH 03/40] new commit --- include/tensor.hpp | 38 +++++---- src/model/tensor.cpp | 164 +++++++++++++++++++++++--------------- src/model/tensor_gpu.cu | 165 ++++++++++++++++++++++++++++++++++++++- src/model/tensor_gpu.hpp | 2 - 4 files changed, 284 insertions(+), 85 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 351cdef..f27a3e3 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -14,11 +14,15 @@ enum class Backend { class Tensor { private: - std::vector data; - std::vector shape; - std::vector strides; + std::vector cpu_data; + std::vector cpu_shape; + std::vector cpu_strides; - Backend BackendType; + ValueType *gpu_data = nullptr; + ValueType *gpu_shape = nullptr; + ValueType *gpu_strides = nullptr; + + bool isGpu() const { return true; } void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -27,29 +31,29 @@ class Tensor { // Constructors Tensor(const std::vector &shape, float init = 0.0f); Tensor(const Tensor &other) - : data(other.data), - shape(other.shape), - strides(other.strides) {} + : cpu_data(other.cpu_data), + cpu_shape(other.cpu_shape), + cpu_strides(other.cpu_strides) {} Tensor &operator=(const Tensor &other); // Element access ValueType &operator()(const std::vector &indices); ValueType operator()(const std::vector &indices) const; - inline ValueType &operator[](size_t i) { return data[i]; } - inline const ValueType &operator[](size_t i) const { return data[i]; } + ValueType &operator[](size_t i); + const ValueType &operator[](size_t i) const; // Iterators (for range-based loops) - auto begin() noexcept { return data.begin(); } - auto end() noexcept { return data.end(); } - auto begin() const noexcept { return data.begin(); } - auto end() const noexcept { return data.end(); } + auto begin() noexcept { return cpu_data.begin(); } + auto end() noexcept { return cpu_data.end(); } + auto begin() const noexcept { return cpu_data.begin(); } + auto end() const noexcept { return cpu_data.end(); } // Shape and size - inline const std::vector &getShape() const { return shape; } - inline size_t numElements() const { return data.size(); } - inline const std::vector &getData() const { return data; } - inline void fill(const ValueType &value) { std::fill(begin(), end(), value); } + const std::vector &getShape() const; + size_t numElements() const; + const std::vector &getData() const; + void fill(const ValueType &value); // Arithmetic operations Tensor operator+(const Tensor &other) const; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 3765a33..bf513bf 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,11 +1,11 @@ +#include "tensor_gpu.hpp" #include #include #include -#include "tensor_gpu.hpp" namespace nn::global { Tensor::Tensor(const std::vector &shape, float init) - : shape(shape) { + : cpu_shape(shape) { if (shape.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } @@ -15,101 +15,135 @@ Tensor::Tensor(const std::vector &shape, float init) shape.end(), size_t(1), std::multiplies<>()); - data.assign(totalSize, init); + cpu_data.assign(totalSize, init); computeStrides(); } +ValueType &Tensor::operator[](size_t i) { + if (isGpu()) { + return gpu_data[i]; + } + return cpu_data[i]; +} + +const ValueType &Tensor::operator[](size_t i) const { + if (isGpu()) { + return gpu_data[i]; + } + return cpu_data[i]; +} + 
+const std::vector &Tensor::getShape() const { + + if (isGpu()) { + return gpu_data[i]; + } + return cpu_shape; +} + +size_t Tensor::numElements() const { + return cpu_data.size(); +} + +const std::vector &Tensor::getData() const { + return cpu_data; +} + +void Tensor::fill(const ValueType &value) { + std::fill(begin(), end(), value); +} + Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - data = other.data; - shape = other.shape; - strides = other.strides; + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; return *this; } void Tensor::computeStrides() { - const size_t dim = shape.size(); - strides.resize(dim); + const size_t dim = cpu_shape.size(); + cpu_strides.resize(dim); size_t stride = 1; for (size_t i = dim; i-- > 0;) { - strides[i] = stride; - stride *= shape[i]; + cpu_strides[i] = stride; + stride *= cpu_shape[i]; } } inline size_t Tensor::flattenIndex(const std::vector &indices) const { - if (indices.size() != shape.size()) { + if (indices.size() != cpu_shape.size()) { throw std::invalid_argument("Incorrect number of indices."); } size_t index = 0; - for (size_t i = 0; i < shape.size(); ++i) { - if (indices[i] >= shape[i]) + for (size_t i = 0; i < cpu_shape.size(); ++i) { + if (indices[i] >= cpu_shape[i]) throw std::out_of_range("Index out of bounds."); - index += indices[i] * strides[i]; + index += indices[i] * cpu_strides[i]; } return index; } ValueType &Tensor::operator()(const std::vector &indices) { - return data[flattenIndex(indices)]; + return cpu_data[flattenIndex(indices)]; } ValueType Tensor::operator()(const std::vector &indices) const { - return data[flattenIndex(indices)]; + return cpu_data[flattenIndex(indices)]; } Tensor Tensor::operator+(const Tensor &other) const { - if (shape != other.shape) { + if (cpu_shape != other.cpu_shape) { throw std::invalid_argument("Shape mismatch in Tensor::operator+."); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) r[i] = a[i] + b[i]; return result; } Tensor Tensor::operator-(const Tensor &other) const { - if (shape != other.shape) { + if (cpu_shape != other.cpu_shape) { throw std::invalid_argument("Shape mismatch in Tensor::operator-."); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) r[i] = a[i] - b[i]; return result; } Tensor Tensor::operator/(const Tensor &other) const { - if (shape != other.shape) { + if (cpu_shape != other.cpu_shape) { throw std::invalid_argument("Shape mismatch in Tensor::operator/."); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) r[i] = a[i] / b[i]; return result; } Tensor 
&Tensor::operator+=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch."); - float *__restrict__ a = data.data(); - const float *__restrict__ b = other.data.data(); - const size_t N = data.size(); + float *__restrict__ a = cpu_data.data(); + const float *__restrict__ b = other.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) a[i] += b[i]; @@ -118,53 +152,53 @@ Tensor &Tensor::operator+=(const Tensor &other) { } Tensor &Tensor::operator-=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch."); - float *a = data.data(); - const float *b = other.data.data(); - const size_t N = data.size(); + float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) a[i] -= b[i]; return *this; } Tensor &Tensor::operator*=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); - const size_t N = data.size(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) - data[i] *= other.data[i]; + cpu_data[i] *= other.cpu_data[i]; return *this; } Tensor &Tensor::operator/=(const Tensor &other) { - if (shape != other.shape) + if (cpu_shape != other.cpu_shape) throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); - const size_t N = data.size(); + const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) - data[i] /= other.data[i]; + cpu_data[i] /= other.cpu_data[i]; return *this; } Tensor &Tensor::operator*=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x *= scalar; return *this; } Tensor &Tensor::operator-=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x -= scalar; return *this; } Tensor &Tensor::operator+=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x += scalar; return *this; } Tensor &Tensor::operator/=(ValueType scalar) { - for (auto &x : data) + for (auto &x : cpu_data) x /= scalar; return *this; } @@ -194,8 +228,8 @@ Tensor Tensor::operator+(ValueType scalar) const { } Tensor Tensor::matmul(const Tensor &other) const { - const auto &aShape = shape; - const auto &bShape = other.shape; + const auto &aShape = cpu_shape; + const auto &bShape = other.cpu_shape; if (aShape.size() != 2 || bShape.size() != 1) throw std::runtime_error("matmul: unsupported shapes."); @@ -207,9 +241,9 @@ Tensor Tensor::matmul(const Tensor &other) const { Tensor result({M}); - const float *A = data.data(); - const float *B = other.data.data(); - float *R = result.data.data(); + const float *A = cpu_data.data(); + const float *B = other.cpu_data.data(); + float *R = result.cpu_data.data(); for (size_t i = 0; i < M; ++i) { float sum = 0.0f; @@ -234,9 +268,9 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { size_t n = bShape[0]; Tensor result({m, n}); - float *r = result.data.data(); - const float *A = a.data.data(); - const float *B = b.data.data(); + float *r = result.cpu_data.data(); + const float *A = a.cpu_data.data(); + const float *B = b.cpu_data.data(); for (size_t i = 0; i < m; ++i) { for (size_t j = 0; j < n; ++j) { @@ -247,8 +281,8 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { } Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = shape; - const auto &vShape = vec.shape; + const auto &wShape = cpu_shape; + const 
auto &vShape = vec.cpu_shape; if (wShape.size() != 2 || vShape.size() != 1) throw std::runtime_error("matmulT: bad dimensions"); @@ -260,9 +294,9 @@ Tensor Tensor::matmulT(const Tensor &vec) const { Tensor result({N}, 0.0f); - const float *W = data.data(); - const float *V = vec.data.data(); - float *R = result.data.data(); + const float *W = cpu_data.data(); + const float *V = vec.cpu_data.data(); + float *R = result.cpu_data.data(); for (size_t i = 0; i < N; ++i) { float sum = 0.0f; diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 8530403..8bf3322 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -1,5 +1,168 @@ -#include "tensor_gpu.hpp" #include +#include "tensor_gpu.hpp" +#include +#include namespace tensor_gpu { + +// Allocate memory on GPU for a tensor. +float* allocate(std::size_t count) { + float* devicePtr = nullptr; + cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(float)); + if (err != cudaSuccess) { + throw std::runtime_error("cudaMalloc failed"); + } + return devicePtr; +} + +// Free GPU memory. +void deallocate(float* devicePtr) { + if (devicePtr) { + cudaFree(devicePtr); + } +} + +// Copy data from CPU to GPU. +void copyToDevice(float* deviceDst, const float* hostSrc, std::size_t count) { + cudaMemcpy(deviceDst, hostSrc, count * sizeof(float), cudaMemcpyHostToDevice); +} + +// Copy data from GPU to CPU. +void copyToHost(float* hostDst, const float* deviceSrc, std::size_t count) { + cudaMemcpy(hostDst, deviceSrc, count * sizeof(float), cudaMemcpyDeviceToHost); +} + +// Kernel to set all elements to zero. +__global__ void zeroKernel(float* data, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + data[idx] = 0.0f; + } +} + +// Set all elements to zero (on GPU). 
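+// Launch configuration used by every wrapper in this file: the grid size is
+// the ceiling of count / 256, so each element gets exactly one thread and the
+// `if (idx < count)` guard inside the kernel covers the final partial block.
+// cudaDeviceSynchronize() makes each call blocking, which keeps the API simple
+// at the cost of a host/device round trip per operation.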
+void zero(float* deviceData, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + zeroKernel<<>>(deviceData, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise addition: C = A + B +__global__ void addKernel(const float* A, const float* B, float* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] + B[idx]; + } +} + +// Element-wise addition: C = A + B +void add(const float* A, const float* B, float* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + addKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise multiplication: C = A * B +__global__ void multiplyKernel(const float* A, const float* B, float* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] * B[idx]; + } +} + +// Element-wise multiply: C = A * B +void multiply(const float* A, const float* B, float* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + multiplyKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Dot product kernel using parallel reduction (simplified version) +__global__ void dotKernel(const float* A, const float* B, float* partialSum, std::size_t count) { + __shared__ float cache[256]; + std::size_t tid = threadIdx.x; + std::size_t idx = blockIdx.x * blockDim.x + tid; + + float temp = 0.0f; + if (idx < count) { + temp = A[idx] * B[idx]; + } + cache[tid] = temp; + __syncthreads(); + + // Reduction in shared memory + for (std::size_t stride = blockDim.x / 2; stride > 0; stride /= 2) { + if (tid < stride) { + cache[tid] += cache[tid + stride]; + } + __syncthreads(); + } + + if (tid == 0) { + partialSum[blockIdx.x] = cache[0]; + } +} + +// Dot product between two vectors (A · B) +float dot(const float* A, const float* B, std::size_t count) { + const std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + + // Allocate partial sums + float* d_partialSum = nullptr; + cudaMalloc(&d_partialSum, numBlocks * sizeof(float)); + + dotKernel<<>>(A, B, d_partialSum, count); + cudaDeviceSynchronize(); + + // Copy partial sums to host + float* h_partialSum = new float[numBlocks]; + cudaMemcpy(h_partialSum, d_partialSum, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); + + // Final reduction on CPU + float totalSum = 0.0f; + for (std::size_t i = 0; i < numBlocks; i++) { + totalSum += h_partialSum[i]; + } + + delete[] h_partialSum; + cudaFree(d_partialSum); + return totalSum; +} + +// Kernel to apply ReLU activation: max(0, x) +__global__ void reluKernel(float* data, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + data[idx] = data[idx] > 0.0f ? data[idx] : 0.0f; + } +} + +// Apply activation function (e.g., ReLU) +void relu(float* deviceData, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + reluKernel<<>>(deviceData, count); + cudaDeviceSynchronize(); +} + +// Kernel to apply ReLU derivative: +// output[i] = input[i] > 0 ? 1 : 0 +__global__ void reluDerivativeKernel(const float* input, float* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + output[idx] = (input[idx] > 0.0f) ? 
1.0f : 0.0f; + } +} + +// Apply derivative of activation function (e.g., ReLU') +void relu_derivative(const float* input, float* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + reluDerivativeKernel<<>>(input, output, count); + cudaDeviceSynchronize(); } +} // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 8294ad7..f6d9e62 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -34,6 +34,4 @@ void relu(float *deviceData, std::size_t count); /// Apply derivative of activation function (e.g., ReLU') void relu_derivative(const float *input, float *output, std::size_t count); -// Add more operations as needed... - } // namespace tensor_gpu From 590e9ccc66be7b10be0feac7e43072b846a8ce75 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 17:56:15 +0300 Subject: [PATCH 04/40] new commit --- include/tensor.hpp | 7 +- src/model/tensor.cpp | 258 +++++++++++++++++--------------- src/networks/fnn/DenseLayer.hpp | 10 +- 3 files changed, 152 insertions(+), 123 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index f27a3e3..6af117d 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -19,10 +19,14 @@ class Tensor { std::vector cpu_strides; ValueType *gpu_data = nullptr; + size_t gpu_data_size{0}; ValueType *gpu_shape = nullptr; + size_t gpu_shape_size{0}; ValueType *gpu_strides = nullptr; + size_t gpu_strides_size{0}; - bool isGpu() const { return true; } + + static const bool isGpu{false}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -50,7 +54,6 @@ class Tensor { auto end() const noexcept { return cpu_data.end(); } // Shape and size - const std::vector &getShape() const; size_t numElements() const; const std::vector &getData() const; void fill(const ValueType &value); diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index bf513bf..ed2dedc 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -4,8 +4,7 @@ #include namespace nn::global { -Tensor::Tensor(const std::vector &shape, float init) - : cpu_shape(shape) { +Tensor::Tensor(const std::vector &shape, float init) { if (shape.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } @@ -15,38 +14,38 @@ Tensor::Tensor(const std::vector &shape, float init) shape.end(), size_t(1), std::multiplies<>()); - cpu_data.assign(totalSize, init); + + if (!isGpu) { + cpu_shape = shape; + cpu_data.assign(totalSize, init); + } + computeStrides(); } ValueType &Tensor::operator[](size_t i) { - if (isGpu()) { - return gpu_data[i]; + if (isGpu) { } return cpu_data[i]; } const ValueType &Tensor::operator[](size_t i) const { - if (isGpu()) { - return gpu_data[i]; + if (isGpu) { } return cpu_data[i]; } -const std::vector &Tensor::getShape() const { - - if (isGpu()) { - return gpu_data[i]; - } - return cpu_shape; -} - size_t Tensor::numElements() const { + if (isGpu) { + return gpu_data_size; + } return cpu_data.size(); } const std::vector &Tensor::getData() const { - return cpu_data; + if (!isGpu) { + return cpu_data; + } } void Tensor::fill(const ValueType &value) { @@ -57,10 +56,12 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; - + if (!isGpu) { + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; + } else { + } return *this; } @@ -138,68 +139,86 @@ Tensor 
Tensor::operator/(const Tensor &other) const { } Tensor &Tensor::operator+=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch."); - - float *__restrict__ a = cpu_data.data(); - const float *__restrict__ b = other.cpu_data.data(); - const size_t N = cpu_data.size(); - - for (size_t i = 0; i < N; ++i) - a[i] += b[i]; - + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] += other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator-=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch."); - float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - a[i] -= b[i]; + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] -= other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator*=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - cpu_data[i] *= other.cpu_data[i]; + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] *= other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator/=(const Tensor &other) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - cpu_data[i] /= other.cpu_data[i]; + if (!isGpu) { + if (cpu_shape != other.cpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + cpu_data[i] /= other.cpu_data[i]; + } else { + } return *this; } Tensor &Tensor::operator*=(ValueType scalar) { - for (auto &x : cpu_data) - x *= scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x *= scalar; + } else { + } return *this; } Tensor &Tensor::operator-=(ValueType scalar) { - for (auto &x : cpu_data) - x -= scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x -= scalar; + } else { + } return *this; } Tensor &Tensor::operator+=(ValueType scalar) { - for (auto &x : cpu_data) - x += scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x += scalar; + } else { + } return *this; } + Tensor &Tensor::operator/=(ValueType scalar) { - for (auto &x : cpu_data) - x /= scalar; + if (!isGpu) { + for (auto &x : cpu_data) + x /= scalar; + } else { + } return *this; } @@ -228,83 +247,86 @@ Tensor Tensor::operator+(ValueType scalar) const { } Tensor Tensor::matmul(const Tensor &other) const { - const auto &aShape = cpu_shape; - const auto &bShape = other.cpu_shape; - - if (aShape.size() != 2 || bShape.size() != 1) - throw std::runtime_error("matmul: unsupported shapes."); - - size_t M = aShape[0]; - size_t K = aShape[1]; - if (K != bShape[0]) - throw std::runtime_error("matmul: shape mismatch."); - - Tensor result({M}); - - const float *A = cpu_data.data(); - const float *B = other.cpu_data.data(); - float *R = result.cpu_data.data(); - - for (size_t i = 0; i < M; 
++i) { - float sum = 0.0f; - size_t base = i * K; - for (size_t j = 0; j < K; ++j) { - sum += A[base + j] * B[j]; + if (!isGpu) { + const auto &aShape = cpu_shape; + const auto &bShape = other.cpu_shape; + + if (aShape.size() != 2 || bShape.size() != 1) + throw std::runtime_error("matmul: unsupported shapes."); + + size_t M = aShape[0]; + size_t K = aShape[1]; + if (K != bShape[0]) + throw std::runtime_error("matmul: shape mismatch."); + + Tensor result({M}); + + const float *A = cpu_data.data(); + const float *B = other.cpu_data.data(); + float *R = result.cpu_data.data(); + + for (size_t i = 0; i < M; ++i) { + float sum = 0.0f; + size_t base = i * K; + for (size_t j = 0; j < K; ++j) { + sum += A[base + j] * B[j]; + } + R[i] = sum; } - R[i] = sum; + return result; } - return result; } Tensor Tensor::outer(const Tensor &a, const Tensor &b) { - const std::vector &aShape = a.getShape(); - const std::vector &bShape = b.getShape(); - - if (aShape.size() != 1 || bShape.size() != 1) { - throw std::runtime_error("outer: both tensors must be 1D vectors"); - } + if (!isGpu) { + if (a.cpu_shape.size() != 1 || b.cpu_shape.size() != 1) { + throw std::runtime_error("outer: both tensors must be 1D vectors"); + } - size_t m = aShape[0]; - size_t n = bShape[0]; + size_t m = a.cpu_shape[0]; + size_t n = b.cpu_shape[0]; - Tensor result({m, n}); - float *r = result.cpu_data.data(); - const float *A = a.cpu_data.data(); - const float *B = b.cpu_data.data(); + Tensor result({m, n}); + float *r = result.cpu_data.data(); + const float *A = a.cpu_data.data(); + const float *B = b.cpu_data.data(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - r[i * n + j] = A[i] * B[j]; + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + r[i * n + j] = A[i] * B[j]; + } } + return result; } - return result; } Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = cpu_shape; - const auto &vShape = vec.cpu_shape; - - if (wShape.size() != 2 || vShape.size() != 1) - throw std::runtime_error("matmulT: bad dimensions"); - - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) - throw std::runtime_error("matmulT: incompatible"); - - Tensor result({N}, 0.0f); - - const float *W = cpu_data.data(); - const float *V = vec.cpu_data.data(); - float *R = result.cpu_data.data(); - - for (size_t i = 0; i < N; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < M; ++j) { - sum += W[j * N + i] * V[j]; + if (!isGpu) { + const auto &wShape = cpu_shape; + const auto &vShape = vec.cpu_shape; + + if (wShape.size() != 2 || vShape.size() != 1) + throw std::runtime_error("matmulT: bad dimensions"); + + size_t M = wShape[0]; + size_t N = wShape[1]; + if (vShape[0] != M) + throw std::runtime_error("matmulT: incompatible"); + + Tensor result({N}, 0.0f); + + const float *W = cpu_data.data(); + const float *V = vec.cpu_data.data(); + float *R = result.cpu_data.data(); + + for (size_t i = 0; i < N; ++i) { + float sum = 0.0f; + for (size_t j = 0; j < M; ++j) { + sum += W[j * N + i] * V[j]; + } + R[i] = sum; } - R[i] = sum; + return result; } - return result; } } // namespace nn::global diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp index 84587aa..2b401a2 100644 --- a/src/networks/fnn/DenseLayer.hpp +++ b/src/networks/fnn/DenseLayer.hpp @@ -11,11 +11,15 @@ struct LayerParams { global::Tensor weights; global::Tensor biases; + size_t size_; + size_t prevSize_; + LayerParams(size_t out_dim, size_t in_dim) - : weights({out_dim, in_dim}), biases({out_dim}) {} + 
: weights({out_dim, in_dim}), biases({out_dim}), + size_(out_dim), prevSize_(in_dim) {} - size_t size() const { return biases.numElements(); } - size_t prevSize() const { return weights.getShape()[1]; } + size_t size() const { return size_; } + size_t prevSize() const { return prevSize_; } size_t paramSize() const { return biases.numElements() + weights.numElements(); } }; From b1a9c5359496a676b201ab65565c424bff2c9e5e Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 18:38:27 +0300 Subject: [PATCH 05/40] new commit --- include/tensor.hpp | 13 +++- src/model/activations.cpp | 135 +++++++++++++++++--------------- src/model/activations.hpp | 3 - src/model/tensor_gpu.cu | 158 +++++++++++++++++++++++++++++++------- src/model/tensor_gpu.hpp | 44 +++++++---- 5 files changed, 243 insertions(+), 110 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 6af117d..758e513 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -4,6 +4,10 @@ #include #include +namespace nn::model { +class Activation; +} + namespace nn::global { using ValueType = float; @@ -19,18 +23,19 @@ class Tensor { std::vector cpu_strides; ValueType *gpu_data = nullptr; - size_t gpu_data_size{0}; + std::size_t gpu_data_size{0}; ValueType *gpu_shape = nullptr; - size_t gpu_shape_size{0}; + size_t gpu_shape_size{0}; ValueType *gpu_strides = nullptr; - size_t gpu_strides_size{0}; - + size_t gpu_strides_size{0}; static const bool isGpu{false}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; + friend model::Activation; + public: // Constructors Tensor(const std::vector &shape, float init = 0.0f); diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 19669bf..525c227 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,36 +1,7 @@ #include "activations.hpp" +#include "tensor_gpu.hpp" namespace nn::model { -global::ValueType Activation::activate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return relu(z); - case ActivationType::LeakyRelu: - return leakyRelu(z); - case ActivationType::Sigmoid: - return sigmoid(z); - case ActivationType::Tanh: - return tanh(z); - default: - return z; - } -} - -global::ValueType Activation::derivativeActivate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return derivativeRelu(z); - case ActivationType::LeakyRelu: - return derivativeLeakyRelu(z); - case ActivationType::Sigmoid: - return derivativeSigmoid(z); - case ActivationType::Tanh: - return derivativeTanh(z); - default: - return z; - } -} - void Activation::activate(const global::Tensor &net, global::Tensor &out) const { switch (activationType) { case ActivationType::Relu: @@ -114,61 +85,105 @@ global::ValueType Activation::derivativeTanh(const global::ValueType z) { } void Activation::relu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = relu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = relu(net[i]); + } + } } void Activation::derivativeRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= 
derivativeRelu(net[i]); + } + } } void Activation::leakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = leakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = leakyRelu(net[i]); + } + } } void Activation::derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeLeakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= derivativeLeakyRelu(net[i]); + } + } } void Activation::sigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = sigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = sigmoid(net[i]); + } + } } void Activation::derivativeSigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeSigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= derivativeSigmoid(net[i]); + } + } } void Activation::tanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = tanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_activation(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] = tanh(net[i]); + } + } } void Activation::derivativeTanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeTanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out[i] *= derivativeTanh(net[i]); + } + } } void Activation::softmax(const global::Tensor &net, global::Tensor &out) { - global::ValueType max = maxVector(net); - global::ValueType sum = 0.0; - - for (size_t i = 0; i < net.numElements(); ++i) { - global::ValueType x = net[i] - max; - if (x < -700.0) - x = -700.0; - if (x > 700.0) - x = 700.0; - out[i] = std::exp(x); - sum += out[i]; - } + if (net.isGpu) { + + } else { + global::ValueType max = maxVector(net); + global::ValueType sum = 0.0; + + for (size_t i = 0; i < net.numElements(); ++i) { + global::ValueType x = net[i] - max; + if (x < -700.0) + x = -700.0; + if (x > 700.0) + x = 700.0; + out[i] = std::exp(x); + sum += out[i]; + } - sum = maxValue(sum, 1e-10); + sum = maxValue(sum, 1e-10); - out /= sum; + out /= sum; + } } } // namespace nn::model diff --git a/src/model/activations.hpp b/src/model/activations.hpp index e49a8f0..9c6b4a2 100644 --- a/src/model/activations.hpp +++ b/src/model/activations.hpp @@ -67,9 +67,6 @@ class Activation { : activationType(other.activationType) {} ~Activation() = default; - global::ValueType activate(const global::ValueType x) const; - global::ValueType derivativeActivate(const global::ValueType x) const; - void activate(const global::Tensor &net, global::Tensor &out) const; void derivativeActivate(const global::Tensor &net, 
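Every GPU activation wired up below follows the same element-wise pattern as the earlier kernels: a __global__ kernel that assigns one thread per element, plus a thin host wrapper that ceil-divides the element count by the block size, launches, and synchronizes. A minimal self-contained sketch of that pattern, with illustrative names (squareKernel/square are not part of the patch):

    #include <cstddef>
    #include <cuda_runtime.h>

    // One thread per element; the bounds check handles the final partial block.
    __global__ void squareKernel(const float *in, float *out, std::size_t count) {
        std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < count) {
            out[idx] = in[idx] * in[idx];
        }
    }

    // Host wrapper: compute the grid size, launch, then block until the kernel finishes.
    void square(const float *in, float *out, std::size_t count) {
        const std::size_t blockSize = 256;
        const std::size_t numBlocks = (count + blockSize - 1) / blockSize;
        squareKernel<<<numBlocks, blockSize>>>(in, out, count);
        cudaDeviceSynchronize();
    }

The sigmoid, tanh and leaky-ReLU wrappers in the diff that follows differ only in the per-element expression and, for leaky ReLU, an extra alpha parameter.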
diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 8bf3322..cd2d5a7 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -3,12 +3,11 @@ #include #include -namespace tensor_gpu { - +namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. -float* allocate(std::size_t count) { - float* devicePtr = nullptr; - cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(float)); +ValueType* allocate(std::size_t count) { + ValueType* devicePtr = nullptr; + cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(ValueType)); if (err != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } @@ -16,24 +15,24 @@ float* allocate(std::size_t count) { } // Free GPU memory. -void deallocate(float* devicePtr) { +void deallocate(ValueType* devicePtr) { if (devicePtr) { cudaFree(devicePtr); } } // Copy data from CPU to GPU. -void copyToDevice(float* deviceDst, const float* hostSrc, std::size_t count) { - cudaMemcpy(deviceDst, hostSrc, count * sizeof(float), cudaMemcpyHostToDevice); +void copyToDevice(ValueType* deviceDst, const ValueType* hostSrc, std::size_t count) { + cudaMemcpy(deviceDst, hostSrc, count * sizeof(ValueType), cudaMemcpyHostToDevice); } // Copy data from GPU to CPU. -void copyToHost(float* hostDst, const float* deviceSrc, std::size_t count) { - cudaMemcpy(hostDst, deviceSrc, count * sizeof(float), cudaMemcpyDeviceToHost); +void copyToHost(ValueType* hostDst, const ValueType* deviceSrc, std::size_t count) { + cudaMemcpy(hostDst, deviceSrc, count * sizeof(ValueType), cudaMemcpyDeviceToHost); } // Kernel to set all elements to zero. -__global__ void zeroKernel(float* data, std::size_t count) { +__global__ void zeroKernel(ValueType* data, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { data[idx] = 0.0f; @@ -41,7 +40,7 @@ __global__ void zeroKernel(float* data, std::size_t count) { } // Set all elements to zero (on GPU). 
-void zero(float* deviceData, std::size_t count) { +void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); @@ -49,7 +48,7 @@ void zero(float* deviceData, std::size_t count) { } // Kernel for element-wise addition: C = A + B -__global__ void addKernel(const float* A, const float* B, float* C, std::size_t count) { +__global__ void addKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { C[idx] = A[idx] + B[idx]; @@ -57,7 +56,7 @@ __global__ void addKernel(const float* A, const float* B, float* C, std::size_t } // Element-wise addition: C = A + B -void add(const float* A, const float* B, float* C, std::size_t count) { +void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); @@ -65,7 +64,7 @@ void add(const float* A, const float* B, float* C, std::size_t count) { } // Kernel for element-wise multiplication: C = A * B -__global__ void multiplyKernel(const float* A, const float* B, float* C, std::size_t count) { +__global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { C[idx] = A[idx] * B[idx]; @@ -73,7 +72,7 @@ __global__ void multiplyKernel(const float* A, const float* B, float* C, std::si } // Element-wise multiply: C = A * B -void multiply(const float* A, const float* B, float* C, std::size_t count) { +void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); @@ -81,8 +80,8 @@ void multiply(const float* A, const float* B, float* C, std::size_t count) { } // Dot product kernel using parallel reduction (simplified version) -__global__ void dotKernel(const float* A, const float* B, float* partialSum, std::size_t count) { - __shared__ float cache[256]; +__global__ void dotKernel(const ValueType* A, const ValueType* B, ValueType* partialSum, std::size_t count) { + __shared__ ValueType cache[256]; std::size_t tid = threadIdx.x; std::size_t idx = blockIdx.x * blockDim.x + tid; @@ -107,23 +106,23 @@ __global__ void dotKernel(const float* A, const float* B, float* partialSum, std } // Dot product between two vectors (A · B) -float dot(const float* A, const float* B, std::size_t count) { +float dot(const ValueType* A, const ValueType* B, std::size_t count) { const std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; // Allocate partial sums - float* d_partialSum = nullptr; - cudaMalloc(&d_partialSum, numBlocks * sizeof(float)); + ValueType* d_partialSum = nullptr; + cudaMalloc(&d_partialSum, numBlocks * sizeof(ValueType)); dotKernel<<>>(A, B, d_partialSum, count); cudaDeviceSynchronize(); // Copy partial sums to host - float* h_partialSum = new float[numBlocks]; + ValueType* h_partialSum = new ValueType[numBlocks]; cudaMemcpy(h_partialSum, d_partialSum, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); // Final reduction on CPU - float totalSum = 0.0f; + ValueType totalSum = 0.0f; for (std::size_t i = 0; i < numBlocks; i++) { totalSum += h_partialSum[i]; } @@ -134,24 +133,24 @@ float dot(const float* A, const float* 
B, std::size_t count) { } // Kernel to apply ReLU activation: max(0, x) -__global__ void reluKernel(float* data, std::size_t count) { +__global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { - data[idx] = data[idx] > 0.0f ? data[idx] : 0.0f; + output[idx] = input[idx] > 0.0 ? input[idx] : 0.0f; } } // Apply activation function (e.g., ReLU) -void relu(float* deviceData, std::size_t count) { +void relu(const ValueType *input, ValueType *output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - reluKernel<<>>(deviceData, count); + reluKernel<<>>(input, output, count); cudaDeviceSynchronize(); } // Kernel to apply ReLU derivative: // output[i] = input[i] > 0 ? 1 : 0 -__global__ void reluDerivativeKernel(const float* input, float* output, std::size_t count) { +__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { output[idx] = (input[idx] > 0.0f) ? 1.0f : 0.0f; @@ -159,10 +158,111 @@ __global__ void reluDerivativeKernel(const float* input, float* output, std::siz } // Apply derivative of activation function (e.g., ReLU') -void relu_derivative(const float* input, float* output, std::size_t count) { +void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); cudaDeviceSynchronize(); } + +// Kernel to apply Sigmoid activation: 1 / (1 + exp(-x)) +__global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType x = input[idx]; + output[idx] = 1.0f / (1.0f + expf(-x)); + } +} + +// Apply Sigmoid activation +void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + sigmoidKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel for Sigmoid derivative: s(x) * (1 - s(x)) +__global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType x = input[idx]; + ValueType s = 1.0f / (1.0f + expf(-x)); + output[idx] = s * (1.0f - s); + } +} + +// Apply Sigmoid derivative +void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + sigmoidDerivativeKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel to apply Tanh activation: tanh(x) +__global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + output[idx] = tanhf(input[idx]); + } +} + +// Apply Tanh activation +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + tanhKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel for Tanh derivative: 1 - tanh(x)^2 +__global__ void tanhDerivativeKernel(const ValueType* input, ValueType* 
output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType t = tanhf(input[idx]); + output[idx] = 1.0f - t * t; + } +} + +// Apply Tanh derivative +void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + tanhDerivativeKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +// Kernel for Leaky ReLU: x > 0 ? x : alpha * x +__global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + ValueType x = input[idx]; + output[idx] = (x > 0.0f) ? x : alpha * x; + } +} + +// Apply Leaky ReLU +void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + leakyReluKernel<<>>(input, output, count, alpha); + cudaDeviceSynchronize(); +} + +// Kernel for Leaky ReLU derivative: x > 0 ? 1 : alpha +__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + output[idx] = (input[idx] > 0.0f) ? 1.0f : alpha; + } +} + +// Apply Leaky ReLU derivative +void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + leakyReluDerivativeKernel<<>>(input, output, count, alpha); + cudaDeviceSynchronize(); +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index f6d9e62..cde66d2 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -1,37 +1,53 @@ +#ifndef TENSOR_GPU +#define TENSOR_GPU + +#include "tensor.hpp" #include class Tensor; // Forward declaration -namespace tensor_gpu { +namespace nn::global::tensor_gpu { /// Allocate memory on GPU for a tensor. -float *allocate(std::size_t count); +ValueType *allocate(std::size_t count); /// Free GPU memory. -void deallocate(float *devicePtr); +void deallocate(ValueType *devicePtr); /// Copy data from CPU to GPU. -void copyToDevice(float *deviceDst, const float *hostSrc, std::size_t count); +void copyToDevice(ValueType *deviceDst, const ValueType *hostSrc, std::size_t count); /// Copy data from GPU to CPU. -void copyToHost(float *hostDst, const float *deviceSrc, std::size_t count); +void copyToHost(ValueType *hostDst, const ValueType *deviceSrc, std::size_t count); /// Set all elements to zero (on GPU). 
-void zero(float *deviceData, std::size_t count); +void zero(ValueType *deviceData, std::size_t count); /// Element-wise addition: C = A + B -void add(const float *A, const float *B, float *C, std::size_t count); +void add(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise multiply: C = A * B -void multiply(const float *A, const float *B, float *C, std::size_t count); +void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Dot product between two vectors (A · B) -float dot(const float *A, const float *B, std::size_t count); +float dot(const ValueType *A, const ValueType *B, std::size_t count); + +// ---------------- ReLU ---------------- +void relu(const ValueType *input, ValueType *output, std::size_t count); +void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); + +// ---------------- Sigmoid ---------------- +void sigmoid(const ValueType *input, ValueType *output, std::size_t count); +void sigmoid_derivative(const ValueType *input, ValueType *output, std::size_t count); + +// ---------------- Tanh ---------------- +void tanh_activation(const ValueType *input, ValueType *output, std::size_t count); +void tanh_derivative(const ValueType *input, ValueType *output, std::size_t count); -/// Apply activation function (e.g., ReLU) -void relu(float *deviceData, std::size_t count); +// ---------------- Leaky ReLU ---------------- +void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); +void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); -/// Apply derivative of activation function (e.g., ReLU') -void relu_derivative(const float *input, float *output, std::size_t count); +} // namespace nn::global::tensor_gpu -} // namespace tensor_gpu +#endif // TENSOR_GPU From 6aecfb500b79ae1e5865877cd9c9376aeb6525a1 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 19:03:50 +0300 Subject: [PATCH 06/40] new commit --- include/tensor.hpp | 6 +-- src/model/tensor.cpp | 99 +++++++++++++++++++++++----------------- src/model/tensor_gpu.cu | 10 ++-- src/model/tensor_gpu.hpp | 8 ++-- 4 files changed, 71 insertions(+), 52 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 758e513..6da851a 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -3,13 +3,13 @@ #include #include +#include "../src/model/tensor_gpu.hpp" namespace nn::model { class Activation; } namespace nn::global { -using ValueType = float; enum class Backend { CPU, @@ -24,9 +24,9 @@ class Tensor { ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; - ValueType *gpu_shape = nullptr; + size_t *gpu_shape = nullptr; size_t gpu_shape_size{0}; - ValueType *gpu_strides = nullptr; + size_t *gpu_strides = nullptr; size_t gpu_strides_size{0}; static const bool isGpu{false}; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ed2dedc..67d06c7 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -18,6 +18,11 @@ Tensor::Tensor(const std::vector &shape, float init) { if (!isGpu) { cpu_shape = shape; cpu_data.assign(totalSize, init); + } else { + gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); + tensor_gpu::copyToDevice(gpu_shape, shape.data(), gpu_data_size * sizeof(size_t)); + gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_data_size = totalSize; } computeStrides(); @@ -76,66 +81,78 @@ void Tensor::computeStrides() { } inline 
size_t Tensor::flattenIndex(const std::vector &indices) const { - if (indices.size() != cpu_shape.size()) { - throw std::invalid_argument("Incorrect number of indices."); - } - size_t index = 0; - for (size_t i = 0; i < cpu_shape.size(); ++i) { - if (indices[i] >= cpu_shape[i]) - throw std::out_of_range("Index out of bounds."); - index += indices[i] * cpu_strides[i]; + if (!isGpu) { + if (indices.size() != cpu_shape.size()) { + throw std::invalid_argument("Incorrect number of indices."); + } + size_t index = 0; + for (size_t i = 0; i < cpu_shape.size(); ++i) { + if (indices[i] >= cpu_shape[i]) + throw std::out_of_range("Index out of bounds."); + index += indices[i] * cpu_strides[i]; + } + return index; } - return index; } ValueType &Tensor::operator()(const std::vector &indices) { - return cpu_data[flattenIndex(indices)]; + if (!isGpu) { + return cpu_data[flattenIndex(indices)]; + } } ValueType Tensor::operator()(const std::vector &indices) const { - return cpu_data[flattenIndex(indices)]; + if (!isGpu) { + return cpu_data[flattenIndex(indices)]; + } } Tensor Tensor::operator+(const Tensor &other) const { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator+."); + if (!isGpu) { + if (cpu_shape != other.cpu_shape) { + throw std::invalid_argument("Shape mismatch in Tensor::operator+."); + } + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + r[i] = a[i] + b[i]; + return result; } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] + b[i]; - return result; } Tensor Tensor::operator-(const Tensor &other) const { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator-."); + if (!isGpu) { + if (cpu_shape != other.cpu_shape) { + throw std::invalid_argument("Shape mismatch in Tensor::operator-."); + } + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + r[i] = a[i] - b[i]; + return result; } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] - b[i]; - return result; } Tensor Tensor::operator/(const Tensor &other) const { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator/."); + if (!isGpu) { + if (cpu_shape != other.cpu_shape) { + throw std::invalid_argument("Shape mismatch in Tensor::operator/."); + } + Tensor result(cpu_shape); + const float *a = cpu_data.data(); + const float *b = other.cpu_data.data(); + float *r = result.cpu_data.data(); + const size_t N = cpu_data.size(); + for (size_t i = 0; i < N; ++i) + r[i] = a[i] / b[i]; + return result; } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] / b[i]; - return result; } Tensor &Tensor::operator+=(const Tensor &other) { diff --git a/src/model/tensor_gpu.cu 
b/src/model/tensor_gpu.cu index cd2d5a7..b911481 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -5,9 +5,9 @@ namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. -ValueType* allocate(std::size_t count) { - ValueType* devicePtr = nullptr; - cudaError_t err = cudaMalloc(&devicePtr, count * sizeof(ValueType)); +void* allocate(std::size_t count) { + void* devicePtr = nullptr; + cudaError_t err = cudaMalloc(&devicePtr, count); if (err != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } @@ -22,8 +22,8 @@ void deallocate(ValueType* devicePtr) { } // Copy data from CPU to GPU. -void copyToDevice(ValueType* deviceDst, const ValueType* hostSrc, std::size_t count) { - cudaMemcpy(deviceDst, hostSrc, count * sizeof(ValueType), cudaMemcpyHostToDevice); +void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { + cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } // Copy data from GPU to CPU. diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index cde66d2..77f7aa3 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -1,21 +1,23 @@ #ifndef TENSOR_GPU #define TENSOR_GPU -#include "tensor.hpp" #include +namespace nn::global { +using ValueType = float; +} class Tensor; // Forward declaration namespace nn::global::tensor_gpu { /// Allocate memory on GPU for a tensor. -ValueType *allocate(std::size_t count); +void *allocate(std::size_t count); /// Free GPU memory. void deallocate(ValueType *devicePtr); /// Copy data from CPU to GPU. -void copyToDevice(ValueType *deviceDst, const ValueType *hostSrc, std::size_t count); +void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); /// Copy data from GPU to CPU. void copyToHost(ValueType *hostDst, const ValueType *deviceSrc, std::size_t count); From 2d8637c3f098543ed07e8b2af7b0ad64208f67a3 Mon Sep 17 00:00:00 2001 From: maayan Date: Tue, 5 Aug 2025 19:51:36 +0300 Subject: [PATCH 07/40] new commit --- include/tensor.hpp | 3 +-- src/model/tensor.cpp | 25 +++++++++++++++++++------ src/model/tensor_gpu.cu | 22 ++++++++++++++++++++-- src/model/tensor_gpu.hpp | 6 +++++- 4 files changed, 45 insertions(+), 11 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 6da851a..2a66eb8 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -25,9 +25,8 @@ class Tensor { ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; size_t *gpu_shape = nullptr; - size_t gpu_shape_size{0}; size_t *gpu_strides = nullptr; - size_t gpu_strides_size{0}; + size_t gpu_shape_size{0}; static const bool isGpu{false}; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 67d06c7..6547556 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -66,17 +66,30 @@ Tensor &Tensor::operator=(const Tensor &other) { cpu_shape = other.cpu_shape; cpu_strides = other.cpu_strides; } else { + gpu_shape = (size_t *)tensor_gpu::allocate(other.gpu_shape_size * sizeof(size_t)); + gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + gpu_data_size = other.gpu_data_size; + gpu_shape_size = other.gpu_shape_size; + + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); + tensor_gpu::copyDeviceToDevice(gpu_strides, other.gpu_strides, gpu_shape_size * sizeof(size_t)); } return *this; } void Tensor::computeStrides() { - const size_t dim = cpu_shape.size(); - 
cpu_strides.resize(dim); - size_t stride = 1; - for (size_t i = dim; i-- > 0;) { - cpu_strides[i] = stride; - stride *= cpu_shape[i]; + if (isGpu) { + gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); + tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size * sizeof(size_t)); + } else { + const size_t dim = cpu_shape.size(); + cpu_strides.resize(dim); + size_t stride = 1; + for (size_t i = dim; i-- > 0;) { + cpu_strides[i] = stride; + stride *= cpu_shape[i]; + } } } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index b911481..9199305 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -26,9 +26,14 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } + +void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count) { + cudaMemcpy(deviceDst, deviceDst, count, cudaMemcpyDeviceToDevice); +} + // Copy data from GPU to CPU. -void copyToHost(ValueType* hostDst, const ValueType* deviceSrc, std::size_t count) { - cudaMemcpy(hostDst, deviceSrc, count * sizeof(ValueType), cudaMemcpyDeviceToHost); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count) { + cudaMemcpy(hostDst, deviceSrc, count, cudaMemcpyDeviceToHost); } // Kernel to set all elements to zero. @@ -132,6 +137,19 @@ float dot(const ValueType* A, const ValueType* B, std::size_t count) { return totalSum; } +__global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim) { + size_t stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + strides[i] = stride; + stride *= shape[i]; + } +} + +void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim) { + computeStrides<<<1, 1>>>(gpu_shape, gpu_strides, ndim); + cudaDeviceSynchronize(); // Ensure computation completes +} + // Kernel to apply ReLU activation: max(0, x) __global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 77f7aa3..3195650 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -20,7 +20,9 @@ void deallocate(ValueType *devicePtr); void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); /// Copy data from GPU to CPU. -void copyToHost(ValueType *hostDst, const ValueType *deviceSrc, std::size_t count); +void copyToHost(void *hostDst, const void *deviceSrc, std::size_t count); + +void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count); /// Set all elements to zero (on GPU). 
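
For reference, the row-major stride layout that computeStrides()/computeStridesDevice() build in this patch can also be prepared on the host and uploaded with a single cudaMemcpy. The sketch below is illustrative only (uploadRowMajorStrides and its arguments are not part of the patches) and simply mirrors the CPU loop shown above.

#include <cuda_runtime.h>
#include <cstddef>
#include <vector>

// Compute strides[i] = shape[i+1] * ... * shape[n-1] on the host, then copy
// them into an already-allocated device buffer of shape.size() elements.
inline void uploadRowMajorStrides(const std::vector<std::size_t> &shape, std::size_t *d_strides) {
    std::vector<std::size_t> strides(shape.size());
    std::size_t stride = 1;
    for (std::size_t i = shape.size(); i-- > 0;) {   // same reverse loop as the CPU path
        strides[i] = stride;
        stride *= shape[i];
    }
    cudaMemcpy(d_strides, strides.data(), strides.size() * sizeof(std::size_t), cudaMemcpyHostToDevice);
}
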
void zero(ValueType *deviceData, std::size_t count); @@ -34,6 +36,8 @@ void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t /// Dot product between two vectors (A · B) float dot(const ValueType *A, const ValueType *B, std::size_t count); +void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); + // ---------------- ReLU ---------------- void relu(const ValueType *input, ValueType *output, std::size_t count); void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); From 228b6d6c2d02bc1972dc73f405db553f5a085b65 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 10:53:09 +0300 Subject: [PATCH 08/40] new commit --- include/tensor.hpp | 8 ++-- src/model/activations.cpp | 29 ++++++------ src/model/tensor.cpp | 23 +++++++++- src/model/tensor_gpu.cu | 97 ++++++++++++++++++++++++++++++++++++++- src/model/tensor_gpu.hpp | 10 ++++ 5 files changed, 148 insertions(+), 19 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 2a66eb8..9147fb1 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -1,9 +1,9 @@ #ifndef TENSOR #define TENSOR +#include "../src/model/tensor_gpu.hpp" #include #include -#include "../src/model/tensor_gpu.hpp" namespace nn::model { class Activation; @@ -23,7 +23,7 @@ class Tensor { std::vector cpu_strides; ValueType *gpu_data = nullptr; - std::size_t gpu_data_size{0}; + std::size_t gpu_data_size{0}; size_t *gpu_shape = nullptr; size_t *gpu_strides = nullptr; size_t gpu_shape_size{0}; @@ -33,6 +33,8 @@ class Tensor { void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; + void loadTempGpu() const; + friend model::Activation; public: @@ -59,7 +61,7 @@ class Tensor { // Shape and size size_t numElements() const; - const std::vector &getData() const; + void getData(std::vector &dest) const; void fill(const ValueType &value); // Arithmetic operations diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 525c227..319104b 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,4 +1,5 @@ #include "activations.hpp" +#include "tensor.hpp" #include "tensor_gpu.hpp" namespace nn::model { @@ -44,7 +45,9 @@ void Activation::derivativeActivate(const global::Tensor &net, global::Tensor &o } global::ValueType Activation::maxVector(const global::Tensor &metrix) { - global::ValueType max = metrix[0]; + if (metrix.isGpu) { + } + global::ValueType max = metrix.cpu_data[0]; for (auto &value : metrix) { if (value > max) { max = value; @@ -89,7 +92,7 @@ void Activation::relu(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::relu(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = relu(net[i]); + out.cpu_data[i] = relu(net.cpu_data[i]); } } } @@ -99,7 +102,7 @@ void Activation::derivativeRelu(const global::Tensor &net, global::Tensor &out) global::tensor_gpu::relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeRelu(net[i]); + out.cpu_data[i] *= derivativeRelu(net.cpu_data[i]); } } } @@ -109,7 +112,7 @@ void Activation::leakyRelu(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::leaky_relu(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = leakyRelu(net[i]); + out.cpu_data[i] = leakyRelu(net.cpu_data[i]); } } } @@ -119,7 +122,7 @@ void 
Activation::derivativeLeakyRelu(const global::Tensor &net, global::Tensor & global::tensor_gpu::leaky_relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeLeakyRelu(net[i]); + out.cpu_data[i] *= derivativeLeakyRelu(net.cpu_data[i]); } } } @@ -129,7 +132,7 @@ void Activation::sigmoid(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::sigmoid(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = sigmoid(net[i]); + out.cpu_data[i] = sigmoid(net.cpu_data[i]); } } } @@ -139,7 +142,7 @@ void Activation::derivativeSigmoid(const global::Tensor &net, global::Tensor &ou global::tensor_gpu::sigmoid_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeSigmoid(net[i]); + out.cpu_data[i] *= derivativeSigmoid(net.cpu_data[i]); } } } @@ -149,7 +152,7 @@ void Activation::tanh(const global::Tensor &net, global::Tensor &out) { global::tensor_gpu::tanh_activation(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] = tanh(net[i]); + out.cpu_data[i] = tanh(net.cpu_data[i]); } } } @@ -159,26 +162,26 @@ void Activation::derivativeTanh(const global::Tensor &net, global::Tensor &out) global::tensor_gpu::tanh_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { for (size_t i = 0; i < net.numElements(); ++i) { - out[i] *= derivativeTanh(net[i]); + out.cpu_data[i] *= derivativeTanh(net.cpu_data[i]); } } } void Activation::softmax(const global::Tensor &net, global::Tensor &out) { if (net.isGpu) { - + global::tensor_gpu::softmax(net.gpu_data, out.gpu_data, net.gpu_data_size); } else { global::ValueType max = maxVector(net); global::ValueType sum = 0.0; for (size_t i = 0; i < net.numElements(); ++i) { - global::ValueType x = net[i] - max; + global::ValueType x = net.cpu_data[i] - max; if (x < -700.0) x = -700.0; if (x > 700.0) x = 700.0; - out[i] = std::exp(x); - sum += out[i]; + out.cpu_data[i] = std::exp(x); + sum += out.cpu_data[i]; } sum = maxValue(sum, 1e-10); diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 6547556..2a0bab9 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,7 +1,9 @@ #include "tensor_gpu.hpp" +#include #include #include #include +#include namespace nn::global { Tensor::Tensor(const std::vector &shape, float init) { @@ -29,13 +31,19 @@ Tensor::Tensor(const std::vector &shape, float init) { } ValueType &Tensor::operator[](size_t i) { + static ValueType value; if (isGpu) { + value = tensor_gpu::getValueAt(gpu_data, i); + return value; } return cpu_data[i]; } const ValueType &Tensor::operator[](size_t i) const { + static ValueType value; if (isGpu) { + value = tensor_gpu::getValueAt(gpu_data, i); + return value; } return cpu_data[i]; } @@ -47,10 +55,15 @@ size_t Tensor::numElements() const { return cpu_data.size(); } -const std::vector &Tensor::getData() const { +void Tensor::getData(std::vector &dest) const { if (!isGpu) { - return cpu_data; + dest = cpu_data; } + + ValueType *newV = nullptr; + tensor_gpu::copyToHost(newV, gpu_data, gpu_data_size * sizeof(ValueType)); + + std::copy(newV, newV + gpu_data_size, dest.begin()); } void Tensor::fill(const ValueType &value) { @@ -95,6 +108,7 @@ void Tensor::computeStrides() { inline size_t Tensor::flattenIndex(const std::vector &indices) const { if (!isGpu) { + // CPU version, same as before if 
(indices.size() != cpu_shape.size()) { throw std::invalid_argument("Incorrect number of indices."); } @@ -105,6 +119,11 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { index += indices[i] * cpu_strides[i]; } return index; + } else { + if (indices.size() != gpu_shape_size) { + throw std::invalid_argument("Incorrect number of indices."); + } + return tensor_gpu::flattenIndexGpu(indices.data(), gpu_shape, gpu_strides, gpu_shape_size); } } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 9199305..99f81b6 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -28,7 +28,7 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count) { - cudaMemcpy(deviceDst, deviceDst, count, cudaMemcpyDeviceToDevice); + cudaMemcpy(deviceDst, deviceSrc, count, cudaMemcpyDeviceToDevice); } // Copy data from GPU to CPU. @@ -283,4 +283,99 @@ void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_ leakyReluDerivativeKernel<<>>(input, output, count, alpha); cudaDeviceSynchronize(); } + +__global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { + extern __shared__ ValueType shared[]; + + std::size_t tid = threadIdx.x; + std::size_t idx = blockIdx.x * blockDim.x + tid; + + if (idx >= count) return; + + // Load input into shared memory + shared[tid] = input[idx]; + __syncthreads(); + + // Step 1: Find max value for numerical stability + ValueType max_val = shared[0]; + for (std::size_t i = 1; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + max_val = fmaxf(max_val, shared[i]); + } + __syncthreads(); + + // Step 2: Compute exp(x - max) + ValueType e = expf(shared[tid] - max_val); + shared[tid] = e; + __syncthreads(); + + // Step 3: Sum of exponentials + ValueType sum = 0.0f; + for (std::size_t i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + sum += shared[i]; + } + __syncthreads(); + + // Step 4: Normalize + output[idx] = shared[tid] / sum; +} + +void softmax(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + std::size_t sharedMemSize = blockSize * sizeof(ValueType); + + softmaxKernel<<>>(input, output, count); + cudaDeviceSynchronize(); +} + +template +void setValueAt(T* devicePtr, std::size_t index, T value) { + cudaMemcpy(devicePtr + index, &value, sizeof(T), cudaMemcpyHostToDevice); +} + +template +ValueType getValueAt(const T* devicePtr , std::size_t index) { + T value; + cudaMemcpy(&value, devicePtr + index, sizeof(T), cudaMemcpyDeviceToHost); + return value; +} + +// Compute flattened index on device +__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, const size_t* strides, size_t ndim, size_t* outIndex) { + size_t idx = 0; + for (size_t i = 0; i < ndim; ++i) { + if (indices[i] >= shape[i]) { + *outIndex = size_t(-1); // invalid index + return; + } + idx += indices[i] * strides[i]; + } + *outIndex = idx; +} + +size_t flattenIndexGpu(const size_t* indices,const size_t* d_shape,const size_t* d_strides,size_t ndim) { + // Copy indices vector to device memory + size_t* d_indices = nullptr; + cudaMalloc(&d_indices, ndim * sizeof(size_t)); + cudaMemcpy(d_indices, indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); + + size_t* d_outIndex = nullptr; + cudaMalloc(&d_outIndex, sizeof(size_t)); + + // Launch kernel with a single 
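
As a host-side reference for the softmaxKernel added above, the same max-subtracted formulation used by the CPU path in activations.cpp can be written in a few lines. This is a hedged sketch intended only for checking GPU output; softmaxHost is not part of the patches and assumes a non-empty input.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax: subtract the max before exponentiating.
std::vector<float> softmaxHost(const std::vector<float> &x) {
    const float maxVal = *std::max_element(x.begin(), x.end());
    std::vector<float> out(x.size());
    float sum = 0.0f;
    for (std::size_t i = 0; i < x.size(); ++i) {
        out[i] = std::exp(x[i] - maxVal);
        sum += out[i];
    }
    for (float &v : out) v /= sum;   // normalize so the outputs sum to 1
    return out;
}
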
thread since this is a scalar computation + flattenIndexKernel<<<1, 1>>>(d_indices, d_shape, d_strides, ndim, d_outIndex); + cudaDeviceSynchronize(); + + size_t hostIndex; + cudaMemcpy(&hostIndex, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + + cudaFree(d_indices); + cudaFree(d_outIndex); + + if (hostIndex == size_t(-1)) { + throw std::out_of_range("Index out of bounds."); + } + + return hostIndex; +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 3195650..a26bf29 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -54,6 +54,16 @@ void tanh_derivative(const ValueType *input, ValueType *output, std::size_t coun void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); +void softmax(const ValueType *net, ValueType *out, std::size_t size); + +template +ValueType getValueAt(const T *devicePtr, std::size_t index); + +template +void setValueAt(T *devicePtr, std::size_t index, T value); + +size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); + } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU From 7c96a3c2d7bcce83b81c503ffa1de8d273f21341 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 11:22:11 +0300 Subject: [PATCH 09/40] new commit --- include/tensor.hpp | 5 ---- src/model/tensor.cpp | 54 ++++++---------------------------------- src/model/tensor_gpu.cu | 48 +++++++++++++++++++++++++++++++++++ src/model/tensor_gpu.hpp | 1 + 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 9147fb1..23dc8b6 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -65,11 +65,6 @@ class Tensor { void fill(const ValueType &value); // Arithmetic operations - Tensor operator+(const Tensor &other) const; - Tensor operator*(const Tensor &other) const; - Tensor operator-(const Tensor &other) const; - Tensor operator/(const Tensor &other) const; - Tensor operator*(ValueType scalar) const; Tensor operator+(ValueType scalar) const; Tensor operator/(ValueType scalar) const; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 2a0bab9..10f7e54 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -128,63 +128,23 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } ValueType &Tensor::operator()(const std::vector &indices) { + static ValueType value; if (!isGpu) { return cpu_data[flattenIndex(indices)]; } + + value = tensor_gpu::getValueAtIndices(indices.data()); + return value; } ValueType Tensor::operator()(const std::vector &indices) const { + static ValueType value; if (!isGpu) { return cpu_data[flattenIndex(indices)]; } -} -Tensor Tensor::operator+(const Tensor &other) const { - if (!isGpu) { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator+."); - } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] + b[i]; - return result; - } -} - -Tensor Tensor::operator-(const Tensor &other) const { - if (!isGpu) { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator-."); - } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const 
float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] - b[i]; - return result; - } -} - -Tensor Tensor::operator/(const Tensor &other) const { - if (!isGpu) { - if (cpu_shape != other.cpu_shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator/."); - } - Tensor result(cpu_shape); - const float *a = cpu_data.data(); - const float *b = other.cpu_data.data(); - float *r = result.cpu_data.data(); - const size_t N = cpu_data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] / b[i]; - return result; - } + value = tensor_gpu::getValueAtIndices(indices.data()); + return value; } Tensor &Tensor::operator+=(const Tensor &other) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 99f81b6..0a3fe10 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -378,4 +378,52 @@ size_t flattenIndexGpu(const size_t* indices,const size_t* d_shape,const size_t* return hostIndex; } + +__global__ void computeFlatIndexKernel( + const size_t* indices, const size_t* shape, const size_t* strides, + size_t rank, size_t* outIndex +) { + size_t flatIndex = 0; + for (size_t i = 0; i < rank; ++i) { + flatIndex += indices[i] * strides[i]; + } + *outIndex = flatIndex; +} + +ValueType getValueAtIndicesGpu( + const ValueType* deviceData, + const size_t* hostIndices, + const size_t* deviceShape, + const size_t* deviceStrides, + size_t rank +) { + // Copy host indices to device + size_t* deviceIndices; + cudaMalloc(&deviceIndices, sizeof(size_t) * rank); + cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * rank, cudaMemcpyHostToDevice); + + // Allocate output for index + size_t* deviceFlatIndex; + cudaMalloc(&deviceFlatIndex, sizeof(size_t)); + + // Launch kernel to compute flat index + computeFlatIndexKernel<<<1, 1>>>( + deviceIndices, deviceShape, deviceStrides, rank, deviceFlatIndex + ); + cudaDeviceSynchronize(); + + // Copy back flat index + size_t flatIndex; + cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + + // Get value at that index + ValueType value; + cudaMemcpy(&value, deviceData + flatIndex, sizeof(ValueType), cudaMemcpyDeviceToHost); + + // Cleanup + cudaFree(deviceIndices); + cudaFree(deviceFlatIndex); + + return value; +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index a26bf29..cf171e5 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -64,6 +64,7 @@ void setValueAt(T *devicePtr, std::size_t index, T value); size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); +ValueType getValueAtIndices(const size_t *indices); } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU From 038136f20c6bdc0a463a797c5e14ec6b27399c80 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 12:05:56 +0300 Subject: [PATCH 10/40] new commit --- include/tensor.hpp | 2 +- src/model/tensor.cpp | 66 ++++++++++++++- src/model/tensor_gpu.cu | 176 ++++++++++++++++++++++++++++++++++++--- src/model/tensor_gpu.hpp | 36 ++++++-- 4 files changed, 258 insertions(+), 22 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 23dc8b6..60bdd3e 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -28,7 +28,7 @@ class Tensor { size_t *gpu_strides = nullptr; size_t gpu_shape_size{0}; - static const bool isGpu{false}; + static const bool isGpu{true}; void computeStrides(); inline size_t flattenIndex(const 
std::vector &indices) const; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 10f7e54..f76e921 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -33,7 +33,7 @@ Tensor::Tensor(const std::vector &shape, float init) { ValueType &Tensor::operator[](size_t i) { static ValueType value; if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); + value = tensor_gpu::getValueAt(gpu_data, i); return value; } return cpu_data[i]; @@ -42,7 +42,7 @@ ValueType &Tensor::operator[](size_t i) { const ValueType &Tensor::operator[](size_t i) const { static ValueType value; if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); + value = tensor_gpu::getValueAt(gpu_data, i); return value; } return cpu_data[i]; @@ -133,7 +133,7 @@ ValueType &Tensor::operator()(const std::vector &indices) { return cpu_data[flattenIndex(indices)]; } - value = tensor_gpu::getValueAtIndices(indices.data()); + value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); return value; } @@ -143,7 +143,7 @@ ValueType Tensor::operator()(const std::vector &indices) const { return cpu_data[flattenIndex(indices)]; } - value = tensor_gpu::getValueAtIndices(indices.data()); + value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); return value; } @@ -155,6 +155,9 @@ Tensor &Tensor::operator+=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -167,6 +170,9 @@ Tensor &Tensor::operator-=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -179,6 +185,9 @@ Tensor &Tensor::operator*=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -191,6 +200,9 @@ Tensor &Tensor::operator/=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { + if (gpu_shape != other.gpu_shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -200,6 +212,7 @@ Tensor &Tensor::operator*=(ValueType scalar) { for (auto &x : cpu_data) x *= scalar; } else { + tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -209,6 +222,7 @@ Tensor &Tensor::operator-=(ValueType scalar) { for (auto &x : cpu_data) x -= scalar; } else { + tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -218,6 +232,7 @@ Tensor &Tensor::operator+=(ValueType scalar) { for (auto &x : cpu_data) x += scalar; } else { + tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -227,6 +242,7 @@ Tensor &Tensor::operator/=(ValueType scalar) { for (auto &x : cpu_data) x /= 
scalar; } else { + tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } @@ -284,6 +300,21 @@ Tensor Tensor::matmul(const Tensor &other) const { } return result; } + + // Validate shapes similarly (assumed available via gpu_shape_size and gpu_shape pointer) + if (gpu_shape_size != 2 || other.gpu_shape_size != 1) + throw std::runtime_error("matmul (GPU): unsupported shapes."); + + size_t M = gpu_shape[0]; + size_t K = gpu_shape[1]; + if (K != other.gpu_shape[0]) + throw std::runtime_error("matmul (GPU): shape mismatch."); + + Tensor result({M}, 0.0f); + + // Call GPU kernel or helper + tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); + return result; } Tensor Tensor::outer(const Tensor &a, const Tensor &b) { @@ -307,6 +338,18 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { } return result; } + + if (a.gpu_shape_size != 1 || b.gpu_shape_size != 1) + throw std::runtime_error("outer (GPU): both tensors must be 1D vectors"); + + size_t m = a.gpu_shape[0]; + size_t n = b.gpu_shape[0]; + + Tensor result({m, n}); + + // Call GPU kernel or helper + tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); + return result; } Tensor Tensor::matmulT(const Tensor &vec) const { @@ -337,5 +380,20 @@ Tensor Tensor::matmulT(const Tensor &vec) const { } return result; } + + // GPU path + if (gpu_shape_size != 2 || vec.gpu_shape_size != 1) + throw std::runtime_error("matmulT (GPU): bad dimensions"); + + size_t M = gpu_shape[0]; + size_t N = gpu_shape[1]; + if (vec.gpu_shape[0] != M) + throw std::runtime_error("matmulT (GPU): incompatible"); + + Tensor result({N}); + + // Call GPU kernel or helper + tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); + return result; } } // namespace nn::global diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 0a3fe10..7741a0d 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -68,6 +68,39 @@ void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count cudaDeviceSynchronize(); } + +// Kernel for element-wise addition: C = A - B +__global__ void subtractionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] - B[idx]; + } +} + +// Element-wise addition: C = A + B +void subtraction(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + subtractionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise addition: C = A / B +__global__ void divisionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] / B[idx]; + } +} + +// Element-wise addition: C = A / B +void division(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + divisionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + // Kernel for element-wise multiplication: C = A * B __global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -84,6 +117,71 @@ void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t 
cudaDeviceSynchronize(); } +// Kernel for element-wise addition: C = A + B +__global__ void addKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] + B; + } +} + +// Element-wise addition: C = A + B +void add(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + addKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + + +// Kernel for element-wise addition: C = A - B +__global__ void subtractionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] - B; + } +} + +// Element-wise addition: C = A + B +void subtraction(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + subtractionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise addition: C = A / B +__global__ void divisionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] / B; + } +} + +// Element-wise addition: C = A / B +void division(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + divisionKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + +// Kernel for element-wise multiplication: C = A * B +__global__ void multiplyKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) { + C[idx] = A[idx] * B; + } +} + +// Element-wise multiply: C = A * B +void multiply(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + multiplyKernel<<>>(A, B, C, count); + cudaDeviceSynchronize(); +} + // Dot product kernel using parallel reduction (simplified version) __global__ void dotKernel(const ValueType* A, const ValueType* B, ValueType* partialSum, std::size_t count) { __shared__ ValueType cache[256]; @@ -328,15 +426,13 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { cudaDeviceSynchronize(); } -template -void setValueAt(T* devicePtr, std::size_t index, T value) { - cudaMemcpy(devicePtr + index, &value, sizeof(T), cudaMemcpyHostToDevice); +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { + cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); } -template -ValueType getValueAt(const T* devicePtr , std::size_t index) { - T value; - cudaMemcpy(&value, devicePtr + index, sizeof(T), cudaMemcpyDeviceToHost); +ValueType getValueAt(const ValueType* devicePtr , std::size_t index) { + ValueType value; + cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); return value; } @@ -390,17 +486,17 @@ __global__ void computeFlatIndexKernel( *outIndex = flatIndex; } -ValueType getValueAtIndicesGpu( +ValueType getValueAtIndices( const ValueType* deviceData, const size_t* hostIndices, const size_t* deviceShape, const size_t* deviceStrides, - size_t rank + size_t size ) { // Copy 
host indices to device size_t* deviceIndices; - cudaMalloc(&deviceIndices, sizeof(size_t) * rank); - cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * rank, cudaMemcpyHostToDevice); + cudaMalloc(&deviceIndices, sizeof(size_t) * size); + cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * size, cudaMemcpyHostToDevice); // Allocate output for index size_t* deviceFlatIndex; @@ -408,7 +504,7 @@ ValueType getValueAtIndicesGpu( // Launch kernel to compute flat index computeFlatIndexKernel<<<1, 1>>>( - deviceIndices, deviceShape, deviceStrides, rank, deviceFlatIndex + deviceIndices, deviceShape, deviceStrides, size, deviceFlatIndex ); cudaDeviceSynchronize(); @@ -426,4 +522,60 @@ ValueType getValueAtIndicesGpu( return value; } + +__global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { + size_t row = blockIdx.x * blockDim.x + threadIdx.x; + if (row < M) { + ValueType sum = 0; + for (size_t j = 0; j < K; ++j) { + sum += A[row * K + j] * B[j]; + } + R[row] = sum; + } +} + +__global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = m * n; + if (idx < total) { + size_t i = idx / n; + size_t j = idx % n; + result[i * n + j] = a[i] * b[j]; + } +} + +__global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { + size_t col = blockIdx.x * blockDim.x + threadIdx.x; + if (col < N) { + ValueType sum = 0; + for (size_t i = 0; i < M; ++i) { + // W is M x N, access element at (i, col) + sum += W[i * N + col] * V[i]; + } + R[col] = sum; + } +} + +// Wrapper functions to launch kernels + +void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { + const int blockSize = 256; + int gridSize = (M + blockSize - 1) / blockSize; + matmulKernel<<>>(A, B, R, M, K); + cudaDeviceSynchronize(); +} + +void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { + const int blockSize = 256; + int gridSize = (m * n + blockSize - 1) / blockSize; + outerKernel<<>>(a, b, result, m, n); + cudaDeviceSynchronize(); +} + +void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { + const int blockSize = 256; + int gridSize = (N + blockSize - 1) / blockSize; + matmulTKernel<<>>(W, V, R, M, N); + cudaDeviceSynchronize(); +} } // namespace tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index cf171e5..e4cdc28 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -30,9 +30,27 @@ void zero(ValueType *deviceData, std::size_t count); /// Element-wise addition: C = A + B void add(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +/// Element-wise addition: C = A - B +void subtraction(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); + +/// Element-wise addition: C = A / B +void division(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); + /// Element-wise multiply: C = A * B void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +/// Element-wise addition: C = A + B +void add(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); + +/// Element-wise addition: C = A - B +void subtraction(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); + +/// Element-wise addition: C = A / B +void division(const ValueType *A, const ValueType B, ValueType *C, 
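
A hedged usage sketch of the matrix-vector helpers defined above. All buffers, sizes, and the wrapper function name are illustrative, not part of the patches; allocate/copyToDevice/copyToHost take byte counts, as elsewhere in this series.

#include "tensor_gpu.hpp"
#include <cstddef>
#include <vector>

using nn::global::ValueType;
namespace gpu = nn::global::tensor_gpu;

// y = W * x, where W is an M x K row-major matrix and x has K elements.
std::vector<ValueType> matVecOnGpu(const std::vector<ValueType> &W,
                                   const std::vector<ValueType> &x,
                                   std::size_t M, std::size_t K) {
    ValueType *d_W = (ValueType *)gpu::allocate(M * K * sizeof(ValueType));
    ValueType *d_x = (ValueType *)gpu::allocate(K * sizeof(ValueType));
    ValueType *d_y = (ValueType *)gpu::allocate(M * sizeof(ValueType));

    gpu::copyToDevice(d_W, W.data(), M * K * sizeof(ValueType));
    gpu::copyToDevice(d_x, x.data(), K * sizeof(ValueType));

    gpu::matmul(d_W, d_x, d_y, M, K);          // launches matmulKernel and synchronizes

    std::vector<ValueType> y(M);
    gpu::copyToHost(y.data(), d_y, M * sizeof(ValueType));

    gpu::deallocate(d_W);
    gpu::deallocate(d_x);
    gpu::deallocate(d_y);
    return y;
}
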
std::size_t count); + +/// Element-wise multiply: C = A * B +void multiply(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); + /// Dot product between two vectors (A · B) float dot(const ValueType *A, const ValueType *B, std::size_t count); @@ -54,17 +72,25 @@ void tanh_derivative(const ValueType *input, ValueType *output, std::size_t coun void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); +// ---------------- Softmax ---------------- void softmax(const ValueType *net, ValueType *out, std::size_t size); -template -ValueType getValueAt(const T *devicePtr, std::size_t index); +ValueType getValueAt(const ValueType *devicePtr, std::size_t index); -template -void setValueAt(T *devicePtr, std::size_t index, T value); +void setValueAt(ValueType *devicePtr, std::size_t index, ValueType value); size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); -ValueType getValueAtIndices(const size_t *indices); +ValueType getValueAtIndices( + const ValueType *deviceData, + const size_t *hostIndices, + const size_t *deviceShape, + const size_t *deviceStrides, + size_t rank); + +void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); +void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); +void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N); } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU From 27858235a39660b414c792c60630c5f2890693d9 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 13:51:26 +0300 Subject: [PATCH 11/40] new commit --- CMakeLists.txt | 7 ++++--- src/model/tensor_gpu.cu | 18 +++++++++++++++--- src/model/tensor_gpu.hpp | 2 +- tests/binary_test.cpp | 1 - tests/data/config-binary_test.json | 2 +- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a622b5b..1bb21b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.28) +set(CMAKE_CUDA_ARCHITECTURES 86) # For RTX 3060 project(NeuralNetwork LANGUAGES CXX CUDA) # Add CUDA here # ------------------------------------------------------------------ @@ -10,6 +11,8 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Enforce it set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +enable_language(CUDA) + # Default to Debug build type if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) @@ -63,7 +66,6 @@ set_target_properties(NeuralNetwork PROPERTIES POSITION_INDEPENDENT_CODE ON) # Enable separable compilation for CUDA files set_target_properties(NeuralNetwork PROPERTIES CUDA_SEPARABLE_COMPILATION ON - CUDA_RESOLVE_DEVICE_SYMBOLS ON ) target_include_directories(NeuralNetwork @@ -83,7 +85,6 @@ target_link_libraries(NeuralNetwork SFML::System nlohmann_json::nlohmann_json cuda - cudart ) target_compile_options(NeuralNetwork PRIVATE -Wall -Wextra -Wpedantic) @@ -121,4 +122,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) - + diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 7741a0d..74c9583 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -1,14 +1,26 @@ #include #include "tensor_gpu.hpp" #include +#include #include + namespace 
nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. -void* allocate(std::size_t count) { - void* devicePtr = nullptr; - cudaError_t err = cudaMalloc(&devicePtr, count); +void* allocate(std::size_t size) { + int count = 0; + cudaError_t err = cudaGetDeviceCount(&count); if (err != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; + } + std::cout << "CUDA device count: " << count << std::endl; + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + std::cout << "GPU memory available: " << prop.totalGlobalMem / (1024 * 1024) << " MB\n"; + + void* devicePtr = nullptr; + cudaError_t err1 = cudaMalloc(&devicePtr, size); + if (err1 != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } return devicePtr; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index e4cdc28..2d0745f 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -11,7 +11,7 @@ class Tensor; // Forward declaration namespace nn::global::tensor_gpu { /// Allocate memory on GPU for a tensor. -void *allocate(std::size_t count); +void *allocate(std::size_t size); /// Free GPU memory. void deallocate(ValueType *devicePtr); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 7dbb327..3f740e2 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -1,4 +1,3 @@ -#include "tensor.hpp" #include "tests.hpp" #include #include diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index 007cb80..59d9390 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": true, + "enableVisuals": false, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, From d50e875c233edde5c9ec022bfc668e70929e511c Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 15:42:54 +0300 Subject: [PATCH 12/40] bug fix --- CMakeLists.txt | 13 +------------ src/model/tensor.cpp | 5 ++++- src/model/tensor_gpu.cu | 11 ----------- tests/binary_test.cpp | 3 ++- 4 files changed, 7 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1bb21b6..c29b4f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,13 +44,6 @@ FetchContent_Declare(nlohmann_json FetchContent_MakeAvailable(SFML nlohmann_json) -# ------------------------------------------------------------------ -# Function: Apply sanitizers (for CPU code only) -function(apply_sanitizers target) - target_compile_options(${target} PRIVATE -fsanitize=address -fno-omit-frame-pointer -g) - target_link_libraries(${target} PRIVATE -fsanitize=address) -endfunction() - # ------------------------------------------------------------------ # Main library @@ -97,8 +90,6 @@ if(BUILD_NN_TESTS) enable_testing() include(CTest) - apply_sanitizers(NeuralNetwork) - file(GLOB TEST_SOURCES CONFIGURE_DEPENDS tests/*.cpp) if(TEST_SOURCES) @@ -109,8 +100,6 @@ if(BUILD_NN_TESTS) target_link_libraries(${test_name} PRIVATE NeuralNetwork) target_include_directories(${test_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") - apply_sanitizers(${test_name}) - add_test(NAME ${test_name} COMMAND ${test_name}) endforeach() else() @@ -122,4 +111,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) - + diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index f76e921..ff9e1e7 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -21,10 +21,13 @@ Tensor::Tensor(const 
std::vector &shape, float init) { cpu_shape = shape; cpu_data.assign(totalSize, init); } else { + gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); tensor_gpu::copyToDevice(gpu_shape, shape.data(), gpu_data_size * sizeof(size_t)); - gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_data_size = totalSize; + gpu_shape_size = shape.size(); } computeStrides(); diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 74c9583..493bbf0 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -1,23 +1,12 @@ #include #include "tensor_gpu.hpp" #include -#include #include namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. void* allocate(std::size_t size) { - int count = 0; - cudaError_t err = cudaGetDeviceCount(&count); - if (err != cudaSuccess) { - std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl; - } - std::cout << "CUDA device count: " << count << std::endl; - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); - std::cout << "GPU memory available: " << prop.totalGlobalMem / (1024 * 1024) << " MB\n"; - void* devicePtr = nullptr; cudaError_t err1 = cudaMalloc(&devicePtr, size); if (err1 != cudaSuccess) { diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 3f740e2..c9716ad 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -1,3 +1,4 @@ +#include "tensor.hpp" #include "tests.hpp" #include #include @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - std::vector files {"../tests/data/test1", "../tests/data/test2"}; + std::vector files{"../tests/data/test1", "../tests/data/test2"}; model.train(files); nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); From 5c37bfbb990df8ba437e32a6e2676ddcc8f55605 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 16:06:39 +0300 Subject: [PATCH 13/40] bug fixes --- include/tensor.hpp | 10 +--------- src/model/dataBase.cpp | 3 ++- src/model/tensor.cpp | 20 ++++++++++++++++++++ src/model/tensor_gpu.cu | 1 - 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 60bdd3e..f47d82b 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -11,11 +11,6 @@ class Activation; namespace nn::global { -enum class Backend { - CPU, - GPU, -}; - class Tensor { private: std::vector cpu_data; @@ -40,10 +35,7 @@ class Tensor { public: // Constructors Tensor(const std::vector &shape, float init = 0.0f); - Tensor(const Tensor &other) - : cpu_data(other.cpu_data), - cpu_shape(other.cpu_shape), - cpu_strides(other.cpu_strides) {} + Tensor(const Tensor &other); Tensor &operator=(const Tensor &other); diff --git a/src/model/dataBase.cpp b/src/model/dataBase.cpp index 2c13444..bb79bce 100644 --- a/src/model/dataBase.cpp +++ b/src/model/dataBase.cpp @@ -65,8 +65,9 @@ int DataBase::load(const std::string &db_filename) { } TrainSample new_sample = readLine(line); - if (new_sample.input.numElements() == 0) + if (new_sample.input.numElements() == 0) { continue; + } samples.add(new_sample); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ff9e1e7..1772600 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -33,6 +33,26 @@ Tensor::Tensor(const std::vector &shape, float init) { computeStrides(); } +Tensor::Tensor(const Tensor &other) { + if (isGpu) { + 
gpu_data_size = other.gpu_data_size; + gpu_shape_size = other.gpu_shape_size; + + gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); + gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); + gpu_shape = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); + + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size*sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); + + } else { + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; + } +} + ValueType &Tensor::operator[](size_t i) { static ValueType value; if (isGpu) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 493bbf0..be277df 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -3,7 +3,6 @@ #include #include - namespace nn::global::tensor_gpu { // Allocate memory on GPU for a tensor. void* allocate(std::size_t size) { From 882810c3403e633bc6a3a7dd87321adefc1c841b Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 18:04:03 +0300 Subject: [PATCH 14/40] new commit --- include/tensor.hpp | 9 ++-- src/model/dataBase.cpp | 2 +- src/model/model.cpp | 23 ++++++---- src/model/tensor.cpp | 68 +++++++++++----------------- src/model/tensor_gpu.cu | 47 ++++++++++++++++++- src/model/tensor_gpu.hpp | 12 ++++- src/networks/fnn/DenseLayer.cpp | 14 +++--- src/networks/fnn/FNNetwork.cpp | 1 + tests/binary_test.cpp | 5 +- tests/data/config-binary_test.json | 4 +- tests/data/database-binary_test.nndb | 2 +- tests/data/test1.nndb | 2 +- tests/data/test2.nndb | 2 +- 13 files changed, 119 insertions(+), 72 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index f47d82b..40c90fe 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -37,13 +37,12 @@ class Tensor { Tensor(const std::vector &shape, float init = 0.0f); Tensor(const Tensor &other); + ~Tensor(); + Tensor &operator=(const Tensor &other); - // Element access - ValueType &operator()(const std::vector &indices); - ValueType operator()(const std::vector &indices) const; - ValueType &operator[](size_t i); - const ValueType &operator[](size_t i) const; + ValueType getValue(const std::vector newShape) const; + void setValue(const std::vector newShape, const ValueType value); // Iterators (for range-based loops) auto begin() noexcept { return cpu_data.begin(); } diff --git a/src/model/dataBase.cpp b/src/model/dataBase.cpp index bb79bce..2a8afce 100644 --- a/src/model/dataBase.cpp +++ b/src/model/dataBase.cpp @@ -26,7 +26,7 @@ TrainSample DataBase::readLine(const std::string &line) { for (size_t i = 0; i < samples.sInputSize; ++i) { iss >> token; - new_sample.input({i}) = std::stod(token); + new_sample.input.setValue({i}, std::stod(token)); } return new_sample; diff --git a/src/model/model.cpp b/src/model/model.cpp index eb2f22e..920ad24 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,5 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" +#include "dataBase.hpp" #include #include #include @@ -134,7 +135,8 @@ void Model::addCNN(const std::uint32_t width, ISubNetworkConfig &_config) { } void Model::runModel(const global::Tensor &input) { - visual.updateInput(input); + // visual.updateInput(input); + printf("test1:\n"); network[0]->forward(input); for (size_t i = 1; i < network.size(); ++i) { @@ -177,13 +179,18 @@ 
global::ValueType Model::runBackPropagation( resetNetworkGradient(); for (size_t i = 0; i < batch.size(); ++i) { - auto current_sample_ptr = batch.samples.at(i); + TrainSample *current_sample_ptr = batch.samples.at(i); visual.updatePrediction(current_sample_ptr->pre); - runModel(transformation(current_sample_ptr->input)); + printf("test1\n"); + printf("test1: %zu\n", current_sample_ptr->input.numElements()); + // runModel(transformation(current_sample_ptr->input)); + + printf("\n"); + runModel(current_sample_ptr->input); global::Tensor output({outputSize()}); - output[current_sample_ptr->pre.index] = 1; + output.setValue({current_sample_ptr->pre.index}, 1); if (doBackward) { Backward(output); @@ -404,7 +411,7 @@ void Model::save(const std::string &file) { outFile << params.numElements() << " "; for (size_t j = 0; j < params.numElements(); ++j) { - outFile << params[j] << " "; + outFile << params.getValue({j}) << " "; } outFile << std::endl; @@ -429,7 +436,7 @@ void Model::load(const std::string &file) { for (size_t i = 0; i < ParamSize; ++i) { iss >> num; - numbers[i] = num; + numbers.setValue({i}, num); } network[networkI]->setParams(numbers); @@ -444,12 +451,12 @@ global::Prediction Model::getPrediction() const { size_t max = 0; for (size_t i = 1; i < outputSize(); ++i) { - if (getOutput()[i] > getOutput()[max]) { + if (getOutput().getValue({i}) > getOutput().getValue({max})) { max = i; } } - return global::Prediction(max, getOutput()[max]); + return global::Prediction(max, getOutput().getValue({max})); } void Model::setTraining() { diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 1772600..4d3807d 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -34,41 +34,23 @@ Tensor::Tensor(const std::vector &shape, float init) { } Tensor::Tensor(const Tensor &other) { - if (isGpu) { - gpu_data_size = other.gpu_data_size; - gpu_shape_size = other.gpu_shape_size; + if (isGpu) { + gpu_data_size = other.gpu_data_size; + gpu_shape_size = other.gpu_shape_size; gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); gpu_shape = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size*sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size*sizeof(size_t)); - - } else { - cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; - } -} - -ValueType &Tensor::operator[](size_t i) { - static ValueType value; - if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); - return value; - } - return cpu_data[i]; -} + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); + tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); -const ValueType &Tensor::operator[](size_t i) const { - static ValueType value; - if (isGpu) { - value = tensor_gpu::getValueAt(gpu_data, i); - return value; + } else { + cpu_data = other.cpu_data; + cpu_shape = other.cpu_shape; + cpu_strides = other.cpu_strides; } - return cpu_data[i]; } size_t Tensor::numElements() const { @@ -150,24 +132,18 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } } -ValueType 
&Tensor::operator()(const std::vector &indices) { - static ValueType value; - if (!isGpu) { - return cpu_data[flattenIndex(indices)]; +ValueType Tensor::getValue(const std::vector newShape) const { + if (isGpu) { + return tensor_gpu::getValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size); } - - value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); - return value; + return cpu_data[flattenIndex(newShape)]; } -ValueType Tensor::operator()(const std::vector &indices) const { - static ValueType value; - if (!isGpu) { - return cpu_data[flattenIndex(indices)]; +void Tensor::setValue(const std::vector newShape, const ValueType value) { + if (isGpu) { + tensor_gpu::setValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size, value); } - - value = tensor_gpu::getValueAtIndices(gpu_data, indices.data(), gpu_shape, gpu_strides, gpu_shape_size); - return value; + cpu_data[flattenIndex(newShape)] = value; } Tensor &Tensor::operator+=(const Tensor &other) { @@ -419,4 +395,12 @@ Tensor Tensor::matmulT(const Tensor &vec) const { tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); return result; } + +Tensor::~Tensor() { + if (isGpu) { + tensor_gpu::deallocate(gpu_data); + tensor_gpu::deallocate(gpu_shape); + tensor_gpu::deallocate(gpu_strides); + } +} } // namespace nn::global diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index be277df..78bb4cd 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -15,7 +15,7 @@ void* allocate(std::size_t size) { } // Free GPU memory. -void deallocate(ValueType* devicePtr) { +void deallocate(void* devicePtr) { if (devicePtr) { cudaFree(devicePtr); } @@ -523,6 +523,51 @@ ValueType getValueAtIndices( return value; } +__global__ void setValueAtIndexKernel(ValueType* data, size_t flatIndex, ValueType value) { + data[flatIndex] = value; +} + +void setValueAtIndices( + ValueType* deviceData, + const size_t* hostIndices, + const size_t* deviceShape, + const size_t* deviceStrides, + size_t ndim, + ValueType value +) { + // Step 1: Allocate and copy indices to GPU + size_t* deviceIndices; + cudaMalloc(&deviceIndices, ndim * sizeof(size_t)); + cudaMemcpy(deviceIndices, hostIndices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); + + // Step 2: Allocate memory to store computed flat index + size_t* deviceFlatIndex; + cudaMalloc(&deviceFlatIndex, sizeof(size_t)); + + // Step 3: Launch kernel to compute flat index + computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceShape, deviceStrides, ndim, deviceFlatIndex); + cudaDeviceSynchronize(); + + // Step 4: Copy flat index to host + size_t flatIndex; + cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + + // Step 5: Validate flat index + if (flatIndex == size_t(-1)) { + cudaFree(deviceIndices); + cudaFree(deviceFlatIndex); + throw std::out_of_range("Invalid indices in setValueAtIndices"); + } + + // Step 6: Launch kernel to set value at computed flat index + setValueAtIndexKernel<<<1, 1>>>(deviceData, flatIndex, value); + cudaDeviceSynchronize(); + + // Cleanup + cudaFree(deviceIndices); + cudaFree(deviceFlatIndex); +} + __global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { size_t row = blockIdx.x * blockDim.x + threadIdx.x; if (row < M) { diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 2d0745f..6142cfa 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -14,7 
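
A brief, hedged usage sketch of the getValue/setValue accessors that replace operator()/operator[] in this patch; the shape and values below are illustrative only.

#include "tensor.hpp"

void exampleAccess() {
    nn::global::Tensor t({2, 3});                      // 2x3 tensor, zero-initialized
    t.setValue({1, 2}, 5.0f);                          // write the element at row 1, column 2
    nn::global::ValueType v = t.getValue({1, 2});      // reads back 5.0f
    (void)v;
}
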
+14,7 @@ namespace nn::global::tensor_gpu { void *allocate(std::size_t size); /// Free GPU memory. -void deallocate(ValueType *devicePtr); +void deallocate(void *devicePtr); /// Copy data from CPU to GPU. void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); @@ -86,7 +86,15 @@ ValueType getValueAtIndices( const size_t *hostIndices, const size_t *deviceShape, const size_t *deviceStrides, - size_t rank); + size_t size); + +void setValueAtIndices( + ValueType *deviceData, + const size_t *hostIndices, + const size_t *deviceShape, + const size_t *deviceStrides, + size_t ndim, + ValueType value); void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index dc8f7ea..a0f9833 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -28,7 +28,7 @@ void Hidden_Layer::CreateDropoutMask() { std::bernoulli_distribution bernoulli(keepProb); for (size_t i = 0; i < dropoutMask.numElements(); ++i) { - dropoutMask[i] = static_cast(bernoulli(rng)); + dropoutMask.setValue({i}, static_cast(bernoulli(rng))); } } @@ -63,7 +63,7 @@ void Output_Layer::backward( global::ValueType Output_Layer::getCrossEntropyLoss( const global::Tensor &prediction, const size_t target) { - return -std::log(std::max(prediction[target], MIN_LOSS_VALUE)); + return -std::log(std::max(prediction.getValue({target}), MIN_LOSS_VALUE)); } global::ValueType Output_Layer::getLoss(const global::Prediction &targets) { @@ -73,7 +73,7 @@ global::ValueType Output_Layer::getLoss(const global::Prediction &targets) { void Hidden_Layer::forward(const global::Tensor &metrix) { if (isTraining) CreateDropoutMask(); - + net = parameters.weights.matmul(metrix); net += parameters.biases; @@ -129,14 +129,14 @@ const global::Tensor DenseLayer::getData() const { size_t currentI = 0; for (size_t i = 0; i < size(); ++i) { for (size_t j = 0; j < prevSize(); ++j) { - matrix[currentI] = parameters.weights({i, j}); + matrix.setValue({currentI}, parameters.weights.getValue({i, j})); ++currentI; } } for (size_t i = 0; i < size(); ++i) { - matrix[currentI] = parameters.biases[i]; + matrix.setValue({currentI}, parameters.biases.getValue({i})); ++currentI; } @@ -148,14 +148,14 @@ void DenseLayer::setData(const global::Tensor newParam) { size_t currentI = 0; for (size_t i = 0; i < size(); ++i) { for (size_t j = 0; j < prevSize(); ++j) { - parameters.weights({i, j}) = newParam[currentI]; + parameters.weights.setValue({i, j}, newParam.getValue({currentI})); ++currentI; } } for (size_t i = 0; i < size(); ++i) { - parameters.biases[i] = newParam[currentI]; + parameters.biases.setValue({i}, newParam.getValue({currentI})); ++currentI; } diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 46f163c..52bde8f 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -52,6 +52,7 @@ void FNNetwork::sendNewVNeurons(const size_t i) const { void FNNetwork::forward(const global::Tensor &newInput) { input = newInput; + printf("test1: %zu", input.numElements()); layers[0]->forward(input); sendNewVNeurons(0); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index c9716ad..5ad6b53 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -77,12 +77,15 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = 
tests::appendToBase("config-binary_test.json"); + // nn::global::Tensor give_me_a_name({5, 1}); + // nn::global::Tensor newt = give_me_a_name; + // return 0; nn::model::Model model(config_FN); if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - std::vector files{"../tests/data/test1", "../tests/data/test2"}; + std::vector files {"../tests/data/test1", "../tests/data/test2"}; model.train(files); nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index 59d9390..66b35f8 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -26,8 +26,8 @@ "network config": [ { "type": "FNN", - "input size": 100, - "output size": 100, + "input size": 10, + "output size": 16, "output activation": 4, "layers": [ { "size": 30, "activationType": 1 }, diff --git a/tests/data/database-binary_test.nndb b/tests/data/database-binary_test.nndb index 4a40143..f075152 100644 --- a/tests/data/database-binary_test.nndb +++ b/tests/data/database-binary_test.nndb @@ -1,4 +1,4 @@ -1000 100 +1000 10 0 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0.1 0 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0 0.1 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 diff --git a/tests/data/test1.nndb b/tests/data/test1.nndb index 1aba172..8ea0dc7 100644 --- a/tests/data/test1.nndb +++ b/tests/data/test1.nndb @@ -1,4 +1,4 @@ -60 100 +60 10 0 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0.1 0 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 0.1 0 0.1 0.1 0.5 0.5 0.5 0.5 0.1 0.1 0.1 0.1 diff --git a/tests/data/test2.nndb b/tests/data/test2.nndb index 69eadc4..209c026 100644 --- a/tests/data/test2.nndb +++ b/tests/data/test2.nndb @@ -1,4 +1,4 @@ -70 100 +70 10 5 0.5 1 0.5 1 0.1 0.1 0.1 0.1 0.1 0.1 5 0.1 0.5 1 0.5 1 0.1 0.1 0.1 0.1 0.1 5 0.1 0.1 0.5 1 0.5 1 0.1 0.1 0.1 0.1 From 4c83f40225d89e4364d8b290296deedad7cd5bf7 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 18:17:11 +0300 Subject: [PATCH 15/40] more bug fixes --- src/networks/fnn/FNNetwork.cpp | 12 +++--------- src/networks/fnn/FnnVisualizer.cpp | 8 ++++---- src/visualizer/visualModel.cpp | 8 +++----- tests/binary_test.cpp | 6 +++--- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 52bde8f..94ee3ac 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -139,13 +139,7 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) { } void FNNetwork::calculateInputDelta(const global::Tensor &deltas) { - input.fill(0); - - for (size_t i = 0; i < inputSize(); ++i) { - for (size_t j = 0; j < layers[0]->size(); ++j) { - input[i] += deltas[j] * layers[0]->getParms().weights({j, i}); - } - } + input = deltas.matmulT(layers[0]->getParms().weights); } size_t FNNetwork::getParamCount() const { @@ -167,7 +161,7 @@ global::Tensor FNNetwork::getParams() const { global::Tensor params = layers[i]->getData(); for (size_t j = 0; j < params.numElements(); ++j) { - matrix[matrixI] = params[j]; + matrix.setValue({matrixI}, params.getValue({j})); ++matrixI; } } @@ -181,7 +175,7 @@ void FNNetwork::setParams(const global::Tensor params) { global::Tensor newParam({layers[i]->getParamCount()}); for (size_t k = 0; k < newParam.numElements(); ++k) { - newParam[k] = params[j]; + newParam.setValue({k}, params.getValue({j})); ++j; } diff --git a/src/networks/fnn/FnnVisualizer.cpp b/src/networks/fnn/FnnVisualizer.cpp index 5ae7306..47359a0 100644 --- 
a/src/networks/fnn/FnnVisualizer.cpp +++ b/src/networks/fnn/FnnVisualizer.cpp @@ -98,7 +98,7 @@ void VisualDenseLayer::drawWeights(const size_t neuron_i, sf::RenderTexture &tar line_[2].position = to; line_[0].color = LINE_COLOR; - line_[0].color.a = parameters.weights({neuron_i, neuronP}) * 50; + line_[0].color.a = parameters.weights.getValue({neuron_i, neuronP}) * 50; line_[1].color = line_[0].color; line_[2].color = getColorFromTextT(getTextT(neuron_i, neuronP)); target.draw(line_); @@ -162,7 +162,7 @@ void VisualDenseLayer::renderNeuron(const size_t index, sf::RenderTexture &targe drawWeights(index, target); } - drawNeuron(cacheNeurons[index], net[index], out[index], target); + drawNeuron(cacheNeurons[index], net.getValue({index}), out.getValue({index}), target); } void VisualDenseLayer::drawNeurons(sf::RenderTexture &target) { @@ -198,10 +198,10 @@ float VisualDenseLayer::calculateGap(const int size, const float scale) { } textType VisualDenseLayer::getTextT(const size_t layer_i, const size_t layer_p) { - if (gradients.weights({layer_i, layer_p}) < 0) + if (gradients.weights.getValue({layer_i, layer_p}) < 0) return textType::DOWN; - if (gradients.weights({layer_i, layer_p}) > 0) + if (gradients.weights.getValue({layer_i, layer_p}) > 0) return textType::UP; return textType::NORMAL; diff --git a/src/visualizer/visualModel.cpp b/src/visualizer/visualModel.cpp index 6391393..4be7484 100644 --- a/src/visualizer/visualModel.cpp +++ b/src/visualizer/visualModel.cpp @@ -4,8 +4,6 @@ #include "fonts.hpp" #include "network/IvisualNetwork.hpp" #include "panel.hpp" -#include -#include #include #include @@ -77,14 +75,14 @@ sf::Color DummyLayer::getNeuronColor(const global::ValueType value) { void DummyLayer::renderNeuron(sf::RenderTexture &target, const size_t index) { sf::RectangleShape shape(cacheNeurons[index].size); - shape.setFillColor(getNeuronColor(values({index}))); + shape.setFillColor(getNeuronColor(values.getValue({index}))); shape.setPosition(cacheNeurons[index].position + pos); target.draw(shape); if (10 * cacheNeurons[index].size.y / global::NEURON_WIDTH > global::MIN_FONT_SIZE) { std::ostringstream ss; - ss << std::fixed << std::setprecision(4) << values({index}); + ss << std::fixed << std::setprecision(4) << values.getValue({index}); sf::Text text(Fonts::getFont()); text.setCharacterSize(10 * cacheNeurons[index].size.y / global::NEURON_WIDTH); @@ -160,7 +158,7 @@ void ModelPanel::renderSubNetwork(const size_t index) { void ModelPanel::setPrediction(const global::Prediction &pre) { global::Tensor output({predictionLayer.size()}); - output({pre.index}) = 1; + output.setValue({pre.index}, 1); predictionLayer.setValues(output); setUpdate(); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 5ad6b53..6bbc2f6 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -129,9 +129,9 @@ int main(int argc, char *argv[]) { } for (size_t i = 4 + num2; i > num2; i--) { - input({i - 1}) = bit_by_index(num1, 4 - i + num2); - if (input({i - 1}) == 0) { - input({i - 1}) = 0.5; + input.setValue({i - 1}, bit_by_index(num1, 4 - i + num2)); + if (input.getValue({i - 1}) == 0) { + input.setValue({i - 1}, 0.5); } } From 8062331aa9612c6dbae7df71fc38ecb4e8a4b6a2 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 19:01:16 +0300 Subject: [PATCH 16/40] bug fix --- src/model/model.cpp | 10 ++-------- src/networks/fnn/FNNetwork.cpp | 3 +-- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/model/model.cpp b/src/model/model.cpp index 920ad24..9e6442c 100644 --- 
a/src/model/model.cpp +++ b/src/model/model.cpp @@ -135,8 +135,7 @@ void Model::addCNN(const std::uint32_t width, ISubNetworkConfig &_config) { } void Model::runModel(const global::Tensor &input) { - // visual.updateInput(input); - printf("test1:\n"); + visual.updateInput(input); network[0]->forward(input); for (size_t i = 1; i < network.size(); ++i) { @@ -182,12 +181,7 @@ global::ValueType Model::runBackPropagation( TrainSample *current_sample_ptr = batch.samples.at(i); visual.updatePrediction(current_sample_ptr->pre); - printf("test1\n"); - printf("test1: %zu\n", current_sample_ptr->input.numElements()); - // runModel(transformation(current_sample_ptr->input)); - - printf("\n"); - runModel(current_sample_ptr->input); + runModel(transformation(current_sample_ptr->input)); global::Tensor output({outputSize()}); output.setValue({current_sample_ptr->pre.index}, 1); diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 94ee3ac..fd6f7fd 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -52,7 +52,6 @@ void FNNetwork::sendNewVNeurons(const size_t i) const { void FNNetwork::forward(const global::Tensor &newInput) { input = newInput; - printf("test1: %zu", input.numElements()); layers[0]->forward(input); sendNewVNeurons(0); @@ -139,7 +138,7 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) { } void FNNetwork::calculateInputDelta(const global::Tensor &deltas) { - input = deltas.matmulT(layers[0]->getParms().weights); + input = layers[0]->getParms().weights.matmulT(deltas); } size_t FNNetwork::getParamCount() const { From 413770bf2ff8a0242c3e24ca3ee3e5d30ce17e96 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 21:06:55 +0300 Subject: [PATCH 17/40] new commit --- include/tensor.hpp | 4 ++-- src/model/tensor.cpp | 22 ++++++++++--------- src/model/tensor_gpu.cu | 46 +++++++++++++++++++--------------------- src/model/tensor_gpu.hpp | 2 -- tests/binary_test.cpp | 6 ++++-- 5 files changed, 40 insertions(+), 40 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 40c90fe..c1fa8fa 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -41,8 +41,8 @@ class Tensor { Tensor &operator=(const Tensor &other); - ValueType getValue(const std::vector newShape) const; - void setValue(const std::vector newShape, const ValueType value); + ValueType getValue(const std::vector &newShape) const; + void setValue(const std::vector &newShape, const ValueType value); // Iterators (for range-based loops) auto begin() noexcept { return cpu_data.begin(); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 4d3807d..ea9a223 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -24,7 +24,7 @@ Tensor::Tensor(const std::vector &shape, float init) { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); - tensor_gpu::copyToDevice(gpu_shape, shape.data(), gpu_data_size * sizeof(size_t)); + tensor_gpu::copyToDevice(gpu_shape, shape.data(), shape.size() * sizeof(size_t)); gpu_data_size = totalSize; gpu_shape_size = shape.size(); @@ -99,7 +99,7 @@ Tensor &Tensor::operator=(const Tensor &other) { void Tensor::computeStrides() { if (isGpu) { gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size * sizeof(size_t)); + tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size); } else { const size_t dim = 
cpu_shape.size(); cpu_strides.resize(dim); @@ -132,18 +132,20 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } } -ValueType Tensor::getValue(const std::vector newShape) const { - if (isGpu) { - return tensor_gpu::getValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size); +ValueType Tensor::getValue(const std::vector &indices) const { + if (!isGpu) { + return cpu_data[flattenIndex(indices)]; } - return cpu_data[flattenIndex(newShape)]; + + return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } -void Tensor::setValue(const std::vector newShape, const ValueType value) { - if (isGpu) { - tensor_gpu::setValueAtIndices(gpu_data, newShape.data(), gpu_shape, gpu_strides, gpu_shape_size, value); +void Tensor::setValue(const std::vector &indices, const ValueType value) { + if (!isGpu) { + cpu_data[flattenIndex(indices)] = value; + } else { + tensor_gpu::setValueAt(gpu_data, flattenIndex(indices), value); } - cpu_data[flattenIndex(newShape)] = value; } Tensor &Tensor::operator+=(const Tensor &other) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 78bb4cd..b27ffd4 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -27,13 +27,13 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { } -void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count) { - cudaMemcpy(deviceDst, deviceSrc, count, cudaMemcpyDeviceToDevice); +void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t size) { + cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); } // Copy data from GPU to CPU. -void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count) { - cudaMemcpy(hostDst, deviceSrc, count, cudaMemcpyDeviceToHost); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { + cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost); } // Kernel to set all elements to zero. 
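// [Editor's note — illustrative sketch, not part of the patch series] The
// single-element helpers getValueAt/setValueAt that back Tensor::getValue and
// Tensor::setValue on the GPU path are assumed to reduce to one cudaMemcpy per
// access, roughly as follows (standalone snippet; ValueType mirrors
// nn::global::ValueType, and the *Sketch names are placeholders, not the
// repository's actual definitions):

#include <cuda_runtime.h>
#include <cstddef>

using ValueType = float;

// Read one element from device memory (synchronous device-to-host copy).
ValueType getValueAtSketch(const ValueType *devicePtr, std::size_t index) {
    ValueType value{};
    cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost);
    return value;
}

// Write one element to device memory (synchronous host-to-device copy).
void setValueAtSketch(ValueType *devicePtr, std::size_t index, ValueType value) {
    cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice);
}

// Every such call is a full host<->device round trip, which is why the later
// patches in this series compute the flat index on the host and keep
// per-element GPU access to a minimum.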
@@ -436,12 +436,13 @@ ValueType getValueAt(const ValueType* devicePtr , std::size_t index) { return value; } -// Compute flattened index on device -__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, const size_t* strides, size_t ndim, size_t* outIndex) { +// Kernel to compute flattened index +__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, + const size_t* strides, size_t ndim, size_t* outIndex) { size_t idx = 0; for (size_t i = 0; i < ndim; ++i) { if (indices[i] >= shape[i]) { - *outIndex = size_t(-1); // invalid index + *outIndex = size_t(-1); return; } idx += indices[i] * strides[i]; @@ -449,34 +450,33 @@ __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, c *outIndex = idx; } -size_t flattenIndexGpu(const size_t* indices,const size_t* d_shape,const size_t* d_strides,size_t ndim) { - // Copy indices vector to device memory - size_t* d_indices = nullptr; +// Host function to launch kernel +size_t flattenIndexGpu(const size_t* h_indices, const size_t* d_shape, + const size_t* d_strides, size_t ndim) { + size_t *d_indices, *d_outIndex; cudaMalloc(&d_indices, ndim * sizeof(size_t)); - cudaMemcpy(d_indices, indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); - - size_t* d_outIndex = nullptr; cudaMalloc(&d_outIndex, sizeof(size_t)); - // Launch kernel with a single thread since this is a scalar computation + cudaMemcpy(d_indices, h_indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); + flattenIndexKernel<<<1, 1>>>(d_indices, d_shape, d_strides, ndim, d_outIndex); cudaDeviceSynchronize(); - size_t hostIndex; - cudaMemcpy(&hostIndex, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); + size_t result; + cudaMemcpy(&result, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); cudaFree(d_indices); cudaFree(d_outIndex); - if (hostIndex == size_t(-1)) { - throw std::out_of_range("Index out of bounds."); + if (result == size_t(-1)) { + throw std::out_of_range("Flattened index out of bounds."); } - return hostIndex; + return result; } __global__ void computeFlatIndexKernel( - const size_t* indices, const size_t* shape, const size_t* strides, + const size_t* indices, const size_t* strides, size_t rank, size_t* outIndex ) { size_t flatIndex = 0; @@ -489,7 +489,6 @@ __global__ void computeFlatIndexKernel( ValueType getValueAtIndices( const ValueType* deviceData, const size_t* hostIndices, - const size_t* deviceShape, const size_t* deviceStrides, size_t size ) { @@ -504,7 +503,7 @@ ValueType getValueAtIndices( // Launch kernel to compute flat index computeFlatIndexKernel<<<1, 1>>>( - deviceIndices, deviceShape, deviceStrides, size, deviceFlatIndex + deviceIndices, deviceStrides, size, deviceFlatIndex ); cudaDeviceSynchronize(); @@ -530,7 +529,6 @@ __global__ void setValueAtIndexKernel(ValueType* data, size_t flatIndex, ValueTy void setValueAtIndices( ValueType* deviceData, const size_t* hostIndices, - const size_t* deviceShape, const size_t* deviceStrides, size_t ndim, ValueType value @@ -545,7 +543,7 @@ void setValueAtIndices( cudaMalloc(&deviceFlatIndex, sizeof(size_t)); // Step 3: Launch kernel to compute flat index - computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceShape, deviceStrides, ndim, deviceFlatIndex); + computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceStrides, ndim, deviceFlatIndex); cudaDeviceSynchronize(); // Step 4: Copy flat index to host diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 6142cfa..ae48a4a 100644 --- a/src/model/tensor_gpu.hpp +++ 
b/src/model/tensor_gpu.hpp @@ -84,14 +84,12 @@ size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_ ValueType getValueAtIndices( const ValueType *deviceData, const size_t *hostIndices, - const size_t *deviceShape, const size_t *deviceStrides, size_t size); void setValueAtIndices( ValueType *deviceData, const size_t *hostIndices, - const size_t *deviceShape, const size_t *deviceStrides, size_t ndim, ValueType value); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 6bbc2f6..aff4107 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -77,9 +77,11 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = tests::appendToBase("config-binary_test.json"); - // nn::global::Tensor give_me_a_name({5, 1}); - // nn::global::Tensor newt = give_me_a_name; + // nn::global::Tensor give_me_a_name({5, 3}); + // printf("test: \n"); + // give_me_a_name.setValue({2, 1}, 5); // return 0; + nn::model::Model model(config_FN); if (argc > 1 && std::string(argv[1]) == "l") { From efe75cbb62be12784c0975c2f4447585d4ca8090 Mon Sep 17 00:00:00 2001 From: maayan Date: Wed, 6 Aug 2025 21:49:46 +0300 Subject: [PATCH 18/40] bug fixes, i simplify the data structure --- include/tensor.hpp | 8 +- src/model/tensor.cpp | 198 +++++++++-------------------- src/model/tensor_gpu.hpp | 1 + src/networks/fnn/DenseLayer.cpp | 1 + src/networks/fnn/FNNetwork.cpp | 2 +- tests/data/config-binary_test.json | 4 +- 6 files changed, 68 insertions(+), 146 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index c1fa8fa..68999da 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -2,7 +2,6 @@ #define TENSOR #include "../src/model/tensor_gpu.hpp" -#include #include namespace nn::model { @@ -14,14 +13,11 @@ namespace nn::global { class Tensor { private: std::vector cpu_data; - std::vector cpu_shape; - std::vector cpu_strides; + std::vector shape; + std::vector strides; ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; - size_t *gpu_shape = nullptr; - size_t *gpu_strides = nullptr; - size_t gpu_shape_size{0}; static const bool isGpu{true}; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ea9a223..c131cf7 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -6,50 +6,38 @@ #include namespace nn::global { -Tensor::Tensor(const std::vector &shape, float init) { - if (shape.empty()) { +Tensor::Tensor(const std::vector &shape_, float init) { + if (shape_.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } size_t totalSize = std::accumulate( - shape.begin(), - shape.end(), + shape_.begin(), + shape_.end(), size_t(1), std::multiplies<>()); + shape = shape_; if (!isGpu) { - cpu_shape = shape; cpu_data.assign(totalSize, init); } else { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); - gpu_shape = (size_t *)tensor_gpu::allocate(shape.size() * sizeof(size_t)); - tensor_gpu::copyToDevice(gpu_shape, shape.data(), shape.size() * sizeof(size_t)); - gpu_data_size = totalSize; - gpu_shape_size = shape.size(); } computeStrides(); } Tensor::Tensor(const Tensor &other) { + shape = other.shape; + strides = other.strides; if (isGpu) { gpu_data_size = other.gpu_data_size; - gpu_shape_size = other.gpu_shape_size; - gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); - gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - gpu_shape = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - 
tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); - } else { cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; } } @@ -79,57 +67,40 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; + shape = other.shape; + strides = other.strides; if (!isGpu) { cpu_data = other.cpu_data; - cpu_shape = other.cpu_shape; - cpu_strides = other.cpu_strides; } else { - gpu_shape = (size_t *)tensor_gpu::allocate(other.gpu_shape_size * sizeof(size_t)); gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; - gpu_shape_size = other.gpu_shape_size; - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_shape, other.gpu_shape, gpu_shape_size * sizeof(size_t)); - tensor_gpu::copyDeviceToDevice(gpu_strides, other.gpu_strides, gpu_shape_size * sizeof(size_t)); } return *this; } void Tensor::computeStrides() { - if (isGpu) { - gpu_strides = (size_t *)tensor_gpu::allocate(gpu_shape_size * sizeof(size_t)); - tensor_gpu::computeStridesDevice(gpu_shape, gpu_strides, gpu_shape_size); - } else { - const size_t dim = cpu_shape.size(); - cpu_strides.resize(dim); - size_t stride = 1; - for (size_t i = dim; i-- > 0;) { - cpu_strides[i] = stride; - stride *= cpu_shape[i]; - } + const size_t dim = shape.size(); + strides.resize(dim); + size_t stride = 1; + for (size_t i = dim; i-- > 0;) { + strides[i] = stride; + stride *= shape[i]; } } inline size_t Tensor::flattenIndex(const std::vector &indices) const { - if (!isGpu) { - // CPU version, same as before - if (indices.size() != cpu_shape.size()) { - throw std::invalid_argument("Incorrect number of indices."); - } - size_t index = 0; - for (size_t i = 0; i < cpu_shape.size(); ++i) { - if (indices[i] >= cpu_shape[i]) - throw std::out_of_range("Index out of bounds."); - index += indices[i] * cpu_strides[i]; - } - return index; - } else { - if (indices.size() != gpu_shape_size) { - throw std::invalid_argument("Incorrect number of indices."); - } - return tensor_gpu::flattenIndexGpu(indices.data(), gpu_shape, gpu_strides, gpu_shape_size); + // CPU version, same as before + if (indices.size() != shape.size()) { + throw std::invalid_argument("Incorrect number of indices."); } + size_t index = 0; + for (size_t i = 0; i < shape.size(); ++i) { + if (indices[i] >= shape[i]) + throw std::out_of_range("Index out of bounds."); + index += indices[i] * strides[i]; + } + return index; } ValueType Tensor::getValue(const std::vector &indices) const { @@ -149,60 +120,52 @@ void Tensor::setValue(const std::vector &indices, const ValueType value) } Tensor &Tensor::operator+=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } Tensor 
&Tensor::operator-=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } Tensor &Tensor::operator*=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; } Tensor &Tensor::operator/=(const Tensor &other) { + if (shape != other.shape) + throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); if (!isGpu) { - if (cpu_shape != other.cpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); const size_t N = cpu_data.size(); for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { - if (gpu_shape != other.gpu_shape) - throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); } return *this; @@ -273,20 +236,18 @@ Tensor Tensor::operator+(ValueType scalar) const { } Tensor Tensor::matmul(const Tensor &other) const { - if (!isGpu) { - const auto &aShape = cpu_shape; - const auto &bShape = other.cpu_shape; + const auto &aShape = shape; + const auto &bShape = other.shape; + if (aShape.size() != 2 || bShape.size() != 1) + throw std::runtime_error("matmul: unsupported shapes."); - if (aShape.size() != 2 || bShape.size() != 1) - throw std::runtime_error("matmul: unsupported shapes."); - - size_t M = aShape[0]; - size_t K = aShape[1]; - if (K != bShape[0]) - throw std::runtime_error("matmul: shape mismatch."); - - Tensor result({M}); + size_t M = aShape[0]; + size_t K = aShape[1]; + if (K != bShape[0]) + throw std::runtime_error("matmul: shape mismatch."); + Tensor result({M}); + if (!isGpu) { const float *A = cpu_data.data(); const float *B = other.cpu_data.data(); float *R = result.cpu_data.data(); @@ -301,33 +262,21 @@ Tensor Tensor::matmul(const Tensor &other) const { } return result; } - - // Validate shapes similarly (assumed available via gpu_shape_size and gpu_shape pointer) - if (gpu_shape_size != 2 || other.gpu_shape_size != 1) - throw std::runtime_error("matmul (GPU): unsupported shapes."); - - size_t M = gpu_shape[0]; - size_t K = gpu_shape[1]; - if (K != other.gpu_shape[0]) - throw std::runtime_error("matmul (GPU): shape mismatch."); - - Tensor result({M}, 0.0f); - - // Call GPU kernel or helper tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); return result; } Tensor Tensor::outer(const Tensor &a, const Tensor &b) { - if (!isGpu) { - if (a.cpu_shape.size() != 1 || b.cpu_shape.size() != 1) { - throw std::runtime_error("outer: both tensors must be 1D vectors"); - } + if (a.shape.size() != 1 || 
b.shape.size() != 1) { + throw std::runtime_error("outer: both tensors must be 1D vectors"); + } - size_t m = a.cpu_shape[0]; - size_t n = b.cpu_shape[0]; + size_t m = a.shape[0]; + size_t n = b.shape[0]; - Tensor result({m, n}); + Tensor result({m, n}); + + if (!isGpu) { float *r = result.cpu_data.data(); const float *A = a.cpu_data.data(); const float *B = b.cpu_data.data(); @@ -339,35 +288,25 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { } return result; } - - if (a.gpu_shape_size != 1 || b.gpu_shape_size != 1) - throw std::runtime_error("outer (GPU): both tensors must be 1D vectors"); - - size_t m = a.gpu_shape[0]; - size_t n = b.gpu_shape[0]; - - Tensor result({m, n}); - - // Call GPU kernel or helper tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); return result; } Tensor Tensor::matmulT(const Tensor &vec) const { - if (!isGpu) { - const auto &wShape = cpu_shape; - const auto &vShape = vec.cpu_shape; + const auto &wShape = shape; + const auto &vShape = vec.shape; - if (wShape.size() != 2 || vShape.size() != 1) - throw std::runtime_error("matmulT: bad dimensions"); + if (wShape.size() != 2 || vShape.size() != 1) + throw std::runtime_error("matmulT: bad dimensions"); - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) - throw std::runtime_error("matmulT: incompatible"); + size_t M = wShape[0]; + size_t N = wShape[1]; + if (vShape[0] != M) + throw std::runtime_error("matmulT: incompatible"); - Tensor result({N}, 0.0f); + Tensor result({N}, 0.0f); + if (!isGpu) { const float *W = cpu_data.data(); const float *V = vec.cpu_data.data(); float *R = result.cpu_data.data(); @@ -381,19 +320,6 @@ Tensor Tensor::matmulT(const Tensor &vec) const { } return result; } - - // GPU path - if (gpu_shape_size != 2 || vec.gpu_shape_size != 1) - throw std::runtime_error("matmulT (GPU): bad dimensions"); - - size_t M = gpu_shape[0]; - size_t N = gpu_shape[1]; - if (vec.gpu_shape[0] != M) - throw std::runtime_error("matmulT (GPU): incompatible"); - - Tensor result({N}); - - // Call GPU kernel or helper tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); return result; } @@ -401,8 +327,6 @@ Tensor Tensor::matmulT(const Tensor &vec) const { Tensor::~Tensor() { if (isGpu) { tensor_gpu::deallocate(gpu_data); - tensor_gpu::deallocate(gpu_shape); - tensor_gpu::deallocate(gpu_strides); } } } // namespace nn::global diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index ae48a4a..1b5a97f 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -2,6 +2,7 @@ #define TENSOR_GPU #include + namespace nn::global { using ValueType = float; } diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index a0f9833..31c5b2a 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -33,6 +33,7 @@ void Hidden_Layer::CreateDropoutMask() { } void Output_Layer::forward(const global::Tensor &metrix) { + net = parameters.weights.matmul(metrix); net += parameters.biases; diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index fd6f7fd..4304b9a 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -52,7 +52,7 @@ void FNNetwork::sendNewVNeurons(const size_t i) const { void FNNetwork::forward(const global::Tensor &newInput) { input = newInput; - layers[0]->forward(input); + layers[0]->forward(newInput); sendNewVNeurons(0); for (size_t i = 1; i < layers.size(); ++i) { diff --git a/tests/data/config-binary_test.json 
b/tests/data/config-binary_test.json index 66b35f8..783afa4 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,9 +1,9 @@ { "visual config": { - "enableVisuals": false, + "enableVisuals": true, "modes": [ { "state": "pause", "mode": true }, - { "state": "precise mode", "mode": false }, + { "state": "precise mode", "mode": true }, { "state": "auto pause", "mode": false } ] }, From ace20ca37a0ad33451cd1bb386771a80d710675b Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 17:53:56 +0300 Subject: [PATCH 19/40] bug fixes --- src/model/tensor.cpp | 1 + tests/data/config-binary_test.json | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index c131cf7..f247ecd 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -72,6 +72,7 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { + tensor_gpu::deallocate(gpu_data); gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index 783afa4..e6982c6 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,14 +1,14 @@ { "visual config": { - "enableVisuals": true, + "enableVisuals": false, "modes": [ { "state": "pause", "mode": true }, - { "state": "precise mode", "mode": true }, + { "state": "precise mode", "mode": false }, { "state": "auto pause", "mode": false } ] }, "training config": { - "batch size": 16, + "batch size": 32, "batch count": 1000, "auto save": { "saveEvery": 2000, @@ -30,7 +30,7 @@ "output size": 16, "output activation": 4, "layers": [ - { "size": 30, "activationType": 1 }, + { "size": 100, "activationType": 1 }, { "size": 30, "activationType": 1 } ] } From 26c607aa8a735cc3456a3b371a0654d702539385 Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 18:12:01 +0300 Subject: [PATCH 20/40] bug fix --- include/tensor.hpp | 6 ------ src/model/activations.cpp | 6 +++--- src/model/model.cpp | 12 +++++++++--- src/model/tensor.cpp | 11 +++++++++-- src/networks/fnn/DenseLayer.cpp | 8 +++++--- tests/binary_test.cpp | 4 ++-- 6 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 68999da..2699c5f 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -40,12 +40,6 @@ class Tensor { ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); - // Iterators (for range-based loops) - auto begin() noexcept { return cpu_data.begin(); } - auto end() noexcept { return cpu_data.end(); } - auto begin() const noexcept { return cpu_data.begin(); } - auto end() const noexcept { return cpu_data.end(); } - // Shape and size size_t numElements() const; void getData(std::vector &dest) const; diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 319104b..9f537c3 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -48,9 +48,9 @@ global::ValueType Activation::maxVector(const global::Tensor &metrix) { if (metrix.isGpu) { } global::ValueType max = metrix.cpu_data[0]; - for (auto &value : metrix) { - if (value > max) { - max = value; + for (size_t i = 0; i < metrix.numElements(); ++i) { + if (metrix.getValue({i}) > max) { + max = metrix.getValue({i}); 
} } diff --git a/src/model/model.cpp b/src/model/model.cpp index 9e6442c..525e4df 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -350,9 +350,15 @@ modelResult Model::evaluateModel( runModel(transformation(sample.input)); - size_t predicted_index = std::distance( - getOutput().begin(), - std::max_element(getOutput().begin(), getOutput().end())); + size_t predicted_index = 0; + float max_value = getOutput().getValue({0}); + + for (size_t j = 1; j < getOutput().numElements(); ++j) { + if (getOutput().getValue({j}) > max_value) { + max_value = getOutput().getValue({j}); + predicted_index = j; + } + } if (showProgressbar) { bar++; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index f247ecd..ca5021b 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -60,7 +60,14 @@ void Tensor::getData(std::vector &dest) const { } void Tensor::fill(const ValueType &value) { - std::fill(begin(), end(), value); + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + tensor_gpu::add(gpu_data, value, gpu_data, gpu_data_size); + } else { + for (auto &n : cpu_data) { + n = value; + } + } } Tensor &Tensor::operator=(const Tensor &other) { @@ -72,7 +79,7 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { - tensor_gpu::deallocate(gpu_data); + tensor_gpu::deallocate(gpu_data); gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index 31c5b2a..51a92a6 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -149,7 +149,7 @@ void DenseLayer::setData(const global::Tensor newParam) { size_t currentI = 0; for (size_t i = 0; i < size(); ++i) { for (size_t j = 0; j < prevSize(); ++j) { - parameters.weights.setValue({i, j}, newParam.getValue({currentI})); + parameters.weights.setValue({i, j}, newParam.getValue({currentI})); ++currentI; } @@ -168,8 +168,10 @@ void DenseLayer::fillParamRandom() { global::ValueType std_dev = std::sqrt(2.0 / static_cast(prevSize())); std::normal_distribution<> dist(0.0, std_dev); - for (auto &value : parameters.weights) { - value = dist(gen); + for (size_t i = 0; i < parameters.size(); ++i) { + for (size_t j = 0; j < parameters.prevSize(); ++j) { + parameters.weights.setValue({i, j}, dist(gen)); + } } } diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index aff4107..7bdd693 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -66,8 +66,8 @@ void print_database(int actual_size, int input_size, int database_size) { } void printVector(const nn::global::Tensor &vec) { - for (const auto &elem : vec) { - std::cout << elem << ' '; + for (size_t i =0; i < vec.numElements(); ++i) { + std::cout << vec.getValue({i}) << ' '; } std::cout << '\n'; From a157c5e1767c121c4d8a88ce2e96f4091c012cab Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 18:39:42 +0300 Subject: [PATCH 21/40] new commit --- src/model/tensor.cpp | 1 - src/model/tensor_gpu.cu | 73 +--------------------------------------- src/model/tensor_gpu.hpp | 3 -- 3 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ca5021b..ef6372a 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -22,7 +22,6 @@ Tensor::Tensor(const std::vector &shape_, float init) { cpu_data.assign(totalSize, init); } else 
{ gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); - gpu_data_size = totalSize; } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index b27ffd4..6b7cbb6 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -49,7 +49,6 @@ void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A + B @@ -65,7 +64,6 @@ void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } @@ -82,7 +80,6 @@ void subtraction(const ValueType* A, const ValueType* B, ValueType* C, std::size std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A / B @@ -98,7 +95,6 @@ void division(const ValueType* A, const ValueType* B, ValueType* C, std::size_t std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise multiplication: C = A * B @@ -114,7 +110,6 @@ void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A + B @@ -130,7 +125,6 @@ void add(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } @@ -147,7 +141,6 @@ void subtraction(const ValueType* A, const ValueType B, ValueType* C, std::size_ std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise addition: C = A / B @@ -163,7 +156,6 @@ void division(const ValueType* A, const ValueType B, ValueType* C, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); } // Kernel for element-wise multiplication: C = A * B @@ -179,60 +171,6 @@ void multiply(const ValueType* A, const ValueType B, ValueType* C, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); - cudaDeviceSynchronize(); -} - -// Dot product kernel using parallel reduction (simplified version) -__global__ void dotKernel(const ValueType* A, const ValueType* B, ValueType* partialSum, std::size_t count) { - __shared__ ValueType cache[256]; - std::size_t tid = threadIdx.x; - std::size_t idx = blockIdx.x * blockDim.x + tid; - - float temp = 0.0f; - if (idx < count) { - temp = A[idx] * B[idx]; - } - cache[tid] = temp; - __syncthreads(); - - // Reduction in shared memory - for (std::size_t stride = blockDim.x / 2; stride > 0; stride /= 2) { - if (tid < stride) { - cache[tid] += cache[tid + stride]; - } - __syncthreads(); - } - - if (tid == 0) { - partialSum[blockIdx.x] = cache[0]; - } -} - -// Dot product between two vectors (A · B) -float dot(const 
ValueType* A, const ValueType* B, std::size_t count) { - const std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - - // Allocate partial sums - ValueType* d_partialSum = nullptr; - cudaMalloc(&d_partialSum, numBlocks * sizeof(ValueType)); - - dotKernel<<>>(A, B, d_partialSum, count); - cudaDeviceSynchronize(); - - // Copy partial sums to host - ValueType* h_partialSum = new ValueType[numBlocks]; - cudaMemcpy(h_partialSum, d_partialSum, numBlocks * sizeof(float), cudaMemcpyDeviceToHost); - - // Final reduction on CPU - ValueType totalSum = 0.0f; - for (std::size_t i = 0; i < numBlocks; i++) { - totalSum += h_partialSum[i]; - } - - delete[] h_partialSum; - cudaFree(d_partialSum); - return totalSum; } __global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim) { @@ -245,7 +183,7 @@ __global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim) { computeStrides<<<1, 1>>>(gpu_shape, gpu_strides, ndim); - cudaDeviceSynchronize(); // Ensure computation completes + cudaDeviceSynchronize(); } // Kernel to apply ReLU activation: max(0, x) @@ -261,7 +199,6 @@ void relu(const ValueType *input, ValueType *output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel to apply ReLU derivative: @@ -278,7 +215,6 @@ void relu_derivative(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel to apply Sigmoid activation: 1 / (1 + exp(-x)) @@ -295,7 +231,6 @@ void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel for Sigmoid derivative: s(x) * (1 - s(x)) @@ -313,7 +248,6 @@ void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidDerivativeKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel to apply Tanh activation: tanh(x) @@ -329,7 +263,6 @@ void tanh_activation(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel for Tanh derivative: 1 - tanh(x)^2 @@ -346,7 +279,6 @@ void tanh_derivative(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhDerivativeKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } // Kernel for Leaky ReLU: x > 0 ? x : alpha * x @@ -363,7 +295,6 @@ void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, Va std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluKernel<<>>(input, output, count, alpha); - cudaDeviceSynchronize(); } // Kernel for Leaky ReLU derivative: x > 0 ? 
1 : alpha @@ -379,7 +310,6 @@ void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_ std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluDerivativeKernel<<>>(input, output, count, alpha); - cudaDeviceSynchronize(); } __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { @@ -423,7 +353,6 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { std::size_t sharedMemSize = blockSize * sizeof(ValueType); softmaxKernel<<>>(input, output, count); - cudaDeviceSynchronize(); } void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 1b5a97f..4e48651 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -52,9 +52,6 @@ void division(const ValueType *A, const ValueType B, ValueType *C, std::size_t c /// Element-wise multiply: C = A * B void multiply(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); -/// Dot product between two vectors (A · B) -float dot(const ValueType *A, const ValueType *B, std::size_t count); - void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); // ---------------- ReLU ---------------- From 696092acd5c48af3e25749004053d350ef17ffaa Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 18:55:29 +0300 Subject: [PATCH 22/40] performance improvment --- include/tensor.hpp | 6 ------ src/model/optimizers.cpp | 5 +++-- src/model/optimizers.hpp | 4 ++-- src/model/tensor.cpp | 45 ++++++++++------------------------------ 4 files changed, 16 insertions(+), 44 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 2699c5f..3da043c 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -45,12 +45,6 @@ class Tensor { void getData(std::vector &dest) const; void fill(const ValueType &value); - // Arithmetic operations - Tensor operator*(ValueType scalar) const; - Tensor operator+(ValueType scalar) const; - Tensor operator/(ValueType scalar) const; - Tensor operator-(ValueType scalar) const; - Tensor &operator+=(const Tensor &other); Tensor &operator-=(const Tensor &other); Tensor &operator*=(const Tensor &other); diff --git a/src/model/optimizers.cpp b/src/model/optimizers.cpp index 9014aa2..b34a89a 100644 --- a/src/model/optimizers.cpp +++ b/src/model/optimizers.cpp @@ -1,7 +1,8 @@ #include "optimizers.hpp" namespace nn::model { -void ConstantOptimizer::step(global::Tensor &weight, const global::Tensor &grad) { - weight -= grad * (config.getLearningRate() / batchSize); +void ConstantOptimizer::step(global::Tensor &weight, global::Tensor &grad) { + grad *= config.getLearningRate() / batchSize; + weight -= grad; } } // namespace nn::model diff --git a/src/model/optimizers.hpp b/src/model/optimizers.hpp index a910edf..203ea4e 100644 --- a/src/model/optimizers.hpp +++ b/src/model/optimizers.hpp @@ -13,7 +13,7 @@ class IOptimizer { public: virtual ~IOptimizer() = default; - virtual void step(global::Tensor &weight, const global::Tensor &grad) = 0; + virtual void step(global::Tensor &weight, global::Tensor &grad) = 0; virtual void reset() = 0; void setOfset(const int batchSize_) { batchSize = batchSize_; } @@ -27,7 +27,7 @@ class ConstantOptimizer : public IOptimizer { ConstantOptimizer(const ConstantOptimizerConfig &config_) : config(config_) {} - void step(global::Tensor &weight, const global::Tensor &grad) override; + void step(global::Tensor &weight, global::Tensor 
&grad) override; void reset() override {} }; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ef6372a..e71c0f9 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -78,10 +78,11 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { - tensor_gpu::deallocate(gpu_data); - gpu_data = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::deallocate(gpu_data); + gpu_data = temp; } return *this; } @@ -134,7 +135,7 @@ Tensor &Tensor::operator+=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { - tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -147,7 +148,7 @@ Tensor &Tensor::operator-=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { - tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -160,7 +161,7 @@ Tensor &Tensor::operator*=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { - tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -173,7 +174,7 @@ Tensor &Tensor::operator/=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { - tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -183,7 +184,7 @@ Tensor &Tensor::operator*=(ValueType scalar) { for (auto &x : cpu_data) x *= scalar; } else { - tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -193,7 +194,7 @@ Tensor &Tensor::operator-=(ValueType scalar) { for (auto &x : cpu_data) x -= scalar; } else { - tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -203,7 +204,7 @@ Tensor &Tensor::operator+=(ValueType scalar) { for (auto &x : cpu_data) x += scalar; } else { - tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -213,35 +214,11 @@ Tensor &Tensor::operator/=(ValueType scalar) { for (auto &x : cpu_data) x /= scalar; } else { - tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } -Tensor Tensor::operator*(ValueType scalar) const { - Tensor result(*this); - result *= scalar; - return result; -} - -Tensor Tensor::operator/(ValueType scalar) const { - Tensor result(*this); - result /= scalar; - return result; -} - -Tensor Tensor::operator-(ValueType scalar) const { - Tensor result(*this); - result -= scalar; - return result; -} - 
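// [Editor's note — not part of the patch] Dropping these by-value scalar
// operators pushes callers to the in-place forms (*=, /=, +=, -=), so no
// temporary Tensor is built per expression; on the GPU path a temporary would
// also mean an extra device allocation and a device-to-device copy. The
// optimizers.cpp hunk above is the matching call-site change, roughly:
//
//     // before: weight -= grad * (config.getLearningRate() / batchSize);
//     // after:  grad *= config.getLearningRate() / batchSize;
//     //         weight -= grad;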
-Tensor Tensor::operator+(ValueType scalar) const { - Tensor result(*this); - result += scalar; - return result; -} - Tensor Tensor::matmul(const Tensor &other) const { const auto &aShape = shape; const auto &bShape = other.shape; From 2f64b7ecce63072d870609f6bb7b9b55dd0888ca Mon Sep 17 00:00:00 2001 From: maayan Date: Thu, 7 Aug 2025 19:10:58 +0300 Subject: [PATCH 23/40] small change --- src/model/tensor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index e71c0f9..749eaeb 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -3,7 +3,6 @@ #include #include #include -#include namespace nn::global { Tensor::Tensor(const std::vector &shape_, float init) { @@ -23,6 +22,7 @@ Tensor::Tensor(const std::vector &shape_, float init) { } else { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_data_size = totalSize; + fill(init); } computeStrides(); From 7890c8f88ff51ece577bc7768e105ef0dc820dd6 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 14:49:11 +0300 Subject: [PATCH 24/40] bug fix, improve performance --- include/network/INetwork.hpp | 2 +- include/tensor.hpp | 4 ++- src/model/model.cpp | 4 ++- src/model/tensor.cpp | 54 +++++++++++++++------------------ src/networks/cnn/CNNetwork.cpp | 2 +- src/networks/cnn/CNNetwork.hpp | 2 +- src/networks/fnn/DenseLayer.cpp | 42 ++++++++++++------------- src/networks/fnn/DenseLayer.hpp | 13 +++++--- src/networks/fnn/FNNetwork.cpp | 14 ++++----- src/networks/fnn/FNNetwork.hpp | 4 +-- 10 files changed, 70 insertions(+), 71 deletions(-) diff --git a/include/network/INetwork.hpp b/include/network/INetwork.hpp index 530a444..574c600 100644 --- a/include/network/INetwork.hpp +++ b/include/network/INetwork.hpp @@ -11,7 +11,7 @@ class INetwork { virtual ~INetwork() = default; virtual void forward(const global::Tensor &input) = 0; - virtual void backward(const global::Tensor &outputDeltas) = 0; + virtual void backward(global::Tensor **outputDeltas) = 0; virtual void updateWeights(IOptimizer &optimizer) = 0; virtual void resetGradient() = 0; diff --git a/include/tensor.hpp b/include/tensor.hpp index 3da043c..f422913 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -42,6 +42,8 @@ class Tensor { // Shape and size size_t numElements() const; + const std::vector &getShape() const { return shape; } + const std::vector &getStrides() const { return strides; } void getData(std::vector &dest) const; void fill(const ValueType &value); @@ -57,7 +59,7 @@ class Tensor { Tensor matmul(const Tensor &other) const; static Tensor outer(const Tensor &a, const Tensor &b); - Tensor matmulT(const Tensor &vec) const; + void matmulT(const Tensor &vec, Tensor &result) const; }; } // namespace nn::global diff --git a/src/model/model.cpp b/src/model/model.cpp index 525e4df..70249a7 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,6 +1,7 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" #include "dataBase.hpp" +#include "tensor.hpp" #include #include #include @@ -159,9 +160,10 @@ void Model::updateWeights(const int batchSize) { void Model::Backward(const global::Tensor &output) { global::Tensor deltas = output; + global::Tensor *delta = &deltas; for (int i = static_cast(network.size()) - 1; i >= 0; --i) { - network[i]->backward(deltas); + network[i]->backward(&delta); deltas = network[i]->getInput(); } } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 749eaeb..0bfe7b0 100644 --- a/src/model/tensor.cpp 
+++ b/src/model/tensor.cpp @@ -22,7 +22,7 @@ Tensor::Tensor(const std::vector &shape_, float init) { } else { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_data_size = totalSize; - fill(init); + fill(init); } computeStrides(); @@ -73,17 +73,25 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - shape = other.shape; - strides = other.strides; if (!isGpu) { cpu_data = other.cpu_data; } else { - ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); - gpu_data_size = other.gpu_data_size; + ValueType *temp = gpu_data; + if (gpu_data_size != other.gpu_data_size) { + temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + + gpu_data_size = other.gpu_data_size; + } tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::deallocate(gpu_data); - gpu_data = temp; + + if (gpu_data_size != other.gpu_data_size) { + tensor_gpu::deallocate(gpu_data); + gpu_data = temp; + } } + + shape = other.shape; + strides = other.strides; return *this; } @@ -276,36 +284,22 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { return result; } -Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = shape; - const auto &vShape = vec.shape; - - if (wShape.size() != 2 || vShape.size() != 1) +void Tensor::matmulT(const Tensor &vec, Tensor &result) const { + if (shape.size() != 2 || vec.shape.size() != 1) throw std::runtime_error("matmulT: bad dimensions"); - - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) + if (vec.shape[0] != shape[0]) throw std::runtime_error("matmulT: incompatible"); - Tensor result({N}, 0.0f); - if (!isGpu) { - const float *W = cpu_data.data(); - const float *V = vec.cpu_data.data(); - float *R = result.cpu_data.data(); - - for (size_t i = 0; i < N; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < M; ++j) { - sum += W[j * N + i] * V[j]; + result.fill(0); + for (size_t i = 0; i < shape[1]; ++i) { + for (size_t j = 0; j < shape[0]; ++j) { + result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j]; } - R[i] = sum; } - return result; + } else { + tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]); } - tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, M, N); - return result; } Tensor::~Tensor() { diff --git a/src/networks/cnn/CNNetwork.cpp b/src/networks/cnn/CNNetwork.cpp index fc45ba3..a2d8b1c 100644 --- a/src/networks/cnn/CNNetwork.cpp +++ b/src/networks/cnn/CNNetwork.cpp @@ -16,7 +16,7 @@ void CNNetwork::forward(const global::Tensor &newInput) { input = newInput; } -void CNNetwork::backward(const global::Tensor &) { +void CNNetwork::backward(global::Tensor **) { } global::ValueType CNNetwork::getLoss(const global::Prediction &) const { diff --git a/src/networks/cnn/CNNetwork.hpp b/src/networks/cnn/CNNetwork.hpp index e2ad58d..1c08705 100644 --- a/src/networks/cnn/CNNetwork.hpp +++ b/src/networks/cnn/CNNetwork.hpp @@ -24,7 +24,7 @@ class CNNetwork : public INetwork { ~CNNetwork() override = default; void forward(const global::Tensor &newInput) override; - void backward(const global::Tensor &outputDeltas) override; + void backward(global::Tensor **outputDeltas) override; void updateWeights(IOptimizer &optimizer) override; void resetGradient() override; diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index 51a92a6..feacf6c 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ 
b/src/networks/fnn/DenseLayer.cpp @@ -11,7 +11,8 @@ DenseLayer::DenseLayer( out({size}), parameters(size, prevSize), gradients(size, prevSize), - activationFunction(activation) { + activationFunction(activation), + deltaL({size}) { if (randomInit) { fillParamRandom(); } @@ -40,25 +41,25 @@ void Output_Layer::forward(const global::Tensor &metrix) { activationFunction.activate(net, out); } -global::Tensor Output_Layer::getDelta(const global::Tensor &output) { - global::Tensor deltas = out; - deltas -= output; - - return deltas; +void Output_Layer::getDelta(const global::Tensor &output) { + deltaL = out; + deltaL -= output; } void Output_Layer::backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *) { if (activationFunction.getType() == ActivationType::Softmax) { - deltas = getDelta(deltas); + getDelta(**deltas); } else { - activationFunction.derivativeActivate(out, deltas); + activationFunction.derivativeActivate(out, **deltas); + deltaL = **deltas; } - gradients.biases += deltas; - gradients.weights += global::Tensor::outer(deltas, prevLayer); + gradients.biases += deltaL; + gradients.weights += global::Tensor::outer(deltaL, prevLayer); + *deltas = &deltaL; } global::ValueType Output_Layer::getCrossEntropyLoss( @@ -87,32 +88,31 @@ void Hidden_Layer::forward(const global::Tensor &metrix) { } void Hidden_Layer::backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *nextLayer) { if (!nextLayer) return; - deltas = getDelta(deltas, *nextLayer); + calculateDelta(**deltas, *nextLayer); if (isTraining && config.dropoutRate) { - deltas *= dropoutMask; + deltaL *= dropoutMask; } - gradients.biases += deltas; + gradients.biases += deltaL; - gradients.weights += global::Tensor::outer(deltas, prevLayer); + gradients.weights += global::Tensor::outer(deltaL, prevLayer); + *deltas = &deltaL; } -global::Tensor Hidden_Layer::getDelta( +void Hidden_Layer::calculateDelta( const global::Tensor &output, const LayerParams &nextLayer) { - auto deltas = nextLayer.weights.matmulT(output); - activationFunction.derivativeActivate(out, deltas); - - return deltas; + nextLayer.weights.matmulT(output, deltaL); + activationFunction.derivativeActivate(out, deltaL); } size_t DenseLayer::getParamCount() const { diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp index 2b401a2..04c807f 100644 --- a/src/networks/fnn/DenseLayer.hpp +++ b/src/networks/fnn/DenseLayer.hpp @@ -3,6 +3,7 @@ #include "../../model/config.hpp" #include "../src/model/optimizers.hpp" +#include "tensor.hpp" namespace nn::model::fnn { constexpr global::ValueType MIN_LOSS_VALUE = 1e-10; @@ -14,6 +15,7 @@ struct LayerParams { size_t size_; size_t prevSize_; + LayerParams(size_t out_dim, size_t in_dim) : weights({out_dim, in_dim}), biases({out_dim}), size_(out_dim), prevSize_(in_dim) {} @@ -35,6 +37,7 @@ class DenseLayer { Activation activationFunction; bool isTraining{false}; + global::Tensor deltaL; void fillParamRandom(); @@ -49,7 +52,7 @@ class DenseLayer { virtual void forward(const global::Tensor &metrix) = 0; void updateWeight(IOptimizer &optimizer); virtual void backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *nextLayer = nullptr) = 0; virtual global::ValueType getLoss(const global::Prediction &) { return 0; }; @@ -77,7 +80,7 @@ class DenseLayer { class Hidden_Layer : public DenseLayer { private: const DenseLayerConfig &config; - 
global::Tensor getDelta( + void calculateDelta( const global::Tensor &output, const LayerParams &nextLayer); @@ -97,7 +100,7 @@ class Hidden_Layer : public DenseLayer { void forward(const global::Tensor &metrix) override; void backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *nextLayer) override; }; @@ -106,7 +109,7 @@ class Output_Layer : public DenseLayer { private: const FNNConfig &config; - global::Tensor getDelta(const global::Tensor &output); + void getDelta(const global::Tensor &output); static global::ValueType getCrossEntropyLoss( const global::Tensor &prediction, const size_t target); @@ -126,7 +129,7 @@ class Output_Layer : public DenseLayer { void forward(const global::Tensor &metrix) override; void backward( - global::Tensor &deltas, + global::Tensor **deltas, const global::Tensor &prevLayer, const LayerParams *) override; diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index 4304b9a..d723cc5 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -72,12 +72,10 @@ void FNNetwork::vUpdate() { visual->attempPause(); } -void FNNetwork::backward(const global::Tensor &outputDeltas) { - global::Tensor deltas = outputDeltas; - +void FNNetwork::backward(global::Tensor **outputDeltas) { resetGradient(); - layers.back()->backward(deltas, layers[layers.size() - 2]->getOut()); + layers.back()->backward(outputDeltas, layers[layers.size() - 2]->getOut()); if (visual) { visual->setGrad(layers.size() - 1, layers[layers.size() - 1]->getGrad()); @@ -85,7 +83,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) { for (int i = static_cast(layers.size()) - 2; i >= 0; --i) { const global::Tensor &prev = (i == 0) ? input : layers[i - 1]->getOut(); - layers[i]->backward(deltas, prev, &layers[i + 1]->getParms()); + layers[i]->backward(outputDeltas, prev, &layers[i + 1]->getParms()); if (visual) { visual->setGrad(i, layers[i]->getGrad()); @@ -94,7 +92,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) { vUpdate(); } - calculateInputDelta(deltas); + calculateInputDelta(outputDeltas); } global::ValueType FNNetwork::getLoss(const global::Prediction &pre) const { @@ -137,8 +135,8 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) { } } -void FNNetwork::calculateInputDelta(const global::Tensor &deltas) { - input = layers[0]->getParms().weights.matmulT(deltas); +void FNNetwork::calculateInputDelta(global::Tensor **deltas) { + layers[0]->getParms().weights.matmulT(**deltas, input); } size_t FNNetwork::getParamCount() const { diff --git a/src/networks/fnn/FNNetwork.hpp b/src/networks/fnn/FNNetwork.hpp index 93d2366..674735c 100644 --- a/src/networks/fnn/FNNetwork.hpp +++ b/src/networks/fnn/FNNetwork.hpp @@ -13,7 +13,7 @@ class FNNetwork : public INetwork { const std::shared_ptr visual; - void calculateInputDelta(const global::Tensor &deltas); + void calculateInputDelta(global::Tensor **deltas); void vUpdate(); @@ -28,7 +28,7 @@ class FNNetwork : public INetwork { ~FNNetwork() override = default; void forward(const global::Tensor &newInput) override; - void backward(const global::Tensor &outputDeltas) override; + void backward(global::Tensor **outputDeltas) override; void updateWeights(IOptimizer &optimizer) override; void resetGradient() override; From d3133930773c18cb315455941ca6aef5d00258e8 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 15:35:10 +0300 Subject: [PATCH 25/40] improved performance --- include/tensor.hpp | 7 +++--- 
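The speed-up in this patch comes from making matmul, outer, and matmulT write into a caller-provided result instead of constructing and returning a fresh Tensor on every call, so each layer can allocate its buffers once and reuse them across samples. A condensed sketch of the resulting call pattern, mirroring the updated DenseLayer::forward further below (`x` and `outDim` are illustrative names only):

    // Buffers such as `net` live in the layer and are reused on every forward pass.
    nn::global::Tensor net({outDim});        // allocated once, at layer construction
    parameters.weights.matmul(x, net);       // net = W * x; matmul zeroes `net` internally
    net += parameters.biases;                // in-place bias add
    activationFunction.activate(net, out);   // writes into the preallocated `out`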
src/model/tensor.cpp | 40 ++++++++++++++++-------------- src/model/tensor_gpu.cu | 2 +- src/networks/fnn/DenseLayer.cpp | 9 +++---- tests/data/config-binary_test.json | 10 +------- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index f422913..06995b1 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -19,7 +19,7 @@ class Tensor { ValueType *gpu_data = nullptr; std::size_t gpu_data_size{0}; - static const bool isGpu{true}; + static const bool isGpu{false}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -46,6 +46,7 @@ class Tensor { const std::vector &getStrides() const { return strides; } void getData(std::vector &dest) const; void fill(const ValueType &value); + void zero(); Tensor &operator+=(const Tensor &other); Tensor &operator-=(const Tensor &other); @@ -57,8 +58,8 @@ class Tensor { Tensor &operator+=(ValueType scalar); Tensor &operator-=(ValueType scalar); - Tensor matmul(const Tensor &other) const; - static Tensor outer(const Tensor &a, const Tensor &b); + void matmul(const Tensor &other, Tensor &result) const; + static void outer(const Tensor &a, const Tensor &b, Tensor &result); void matmulT(const Tensor &vec, Tensor &result) const; }; } // namespace nn::global diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 0bfe7b0..f4b7795 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -50,12 +50,9 @@ size_t Tensor::numElements() const { void Tensor::getData(std::vector &dest) const { if (!isGpu) { dest = cpu_data; + } else { + tensor_gpu::copyToHost(dest.data(), gpu_data, gpu_data_size * sizeof(ValueType)); } - - ValueType *newV = nullptr; - tensor_gpu::copyToHost(newV, gpu_data, gpu_data_size * sizeof(ValueType)); - - std::copy(newV, newV + gpu_data_size, dest.begin()); } void Tensor::fill(const ValueType &value) { @@ -69,6 +66,14 @@ void Tensor::fill(const ValueType &value) { } } +void Tensor::zero() { + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + } else { + fill(0); + } +} + Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; @@ -106,7 +111,6 @@ void Tensor::computeStrides() { } inline size_t Tensor::flattenIndex(const std::vector &indices) const { - // CPU version, same as before if (indices.size() != shape.size()) { throw std::invalid_argument("Incorrect number of indices."); } @@ -227,7 +231,7 @@ Tensor &Tensor::operator/=(ValueType scalar) { return *this; } -Tensor Tensor::matmul(const Tensor &other) const { +void Tensor::matmul(const Tensor &other, Tensor &result) const { const auto &aShape = shape; const auto &bShape = other.shape; if (aShape.size() != 2 || bShape.size() != 1) @@ -237,7 +241,8 @@ Tensor Tensor::matmul(const Tensor &other) const { size_t K = aShape[1]; if (K != bShape[0]) throw std::runtime_error("matmul: shape mismatch."); - Tensor result({M}); + + result.zero(); if (!isGpu) { const float *A = cpu_data.data(); @@ -252,13 +257,12 @@ Tensor Tensor::matmul(const Tensor &other) const { } R[i] = sum; } - return result; + } else { + tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); } - tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); - return result; } -Tensor Tensor::outer(const Tensor &a, const Tensor &b) { +void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) { if (a.shape.size() != 1 || b.shape.size() != 1) { throw std::runtime_error("outer: both tensors must be 1D vectors"); } @@ -266,7 +270,7 @@ Tensor Tensor::outer(const Tensor &a, 
const Tensor &b) { size_t m = a.shape[0]; size_t n = b.shape[0]; - Tensor result({m, n}); + result.zero(); if (!isGpu) { float *r = result.cpu_data.data(); @@ -275,13 +279,12 @@ Tensor Tensor::outer(const Tensor &a, const Tensor &b) { for (size_t i = 0; i < m; ++i) { for (size_t j = 0; j < n; ++j) { - r[i * n + j] = A[i] * B[j]; + r[i * n + j] += A[i] * B[j]; } } - return result; + } else { + tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); } - tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); - return result; } void Tensor::matmulT(const Tensor &vec, Tensor &result) const { @@ -290,8 +293,9 @@ void Tensor::matmulT(const Tensor &vec, Tensor &result) const { if (vec.shape[0] != shape[0]) throw std::runtime_error("matmulT: incompatible"); + result.zero(); + if (!isGpu) { - result.fill(0); for (size_t i = 0; i < shape[1]; ++i) { for (size_t j = 0; j < shape[0]; ++j) { result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j]; diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 6b7cbb6..587c8de 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -512,7 +512,7 @@ __global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *r if (idx < total) { size_t i = idx / n; size_t j = idx % n; - result[i * n + j] = a[i] * b[j]; + result[i * n + j] += a[i] * b[j]; } } diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index feacf6c..cc7b208 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -34,8 +34,7 @@ void Hidden_Layer::CreateDropoutMask() { } void Output_Layer::forward(const global::Tensor &metrix) { - - net = parameters.weights.matmul(metrix); + parameters.weights.matmul(metrix, net); net += parameters.biases; activationFunction.activate(net, out); @@ -58,7 +57,7 @@ void Output_Layer::backward( } gradients.biases += deltaL; - gradients.weights += global::Tensor::outer(deltaL, prevLayer); + global::Tensor::outer(deltaL, prevLayer, gradients.weights); *deltas = &deltaL; } @@ -76,7 +75,7 @@ void Hidden_Layer::forward(const global::Tensor &metrix) { if (isTraining) CreateDropoutMask(); - net = parameters.weights.matmul(metrix); + parameters.weights.matmul(metrix, net); net += parameters.biases; if (isTraining && config.dropoutRate > 0.0f) { @@ -103,7 +102,7 @@ void Hidden_Layer::backward( gradients.biases += deltaL; - gradients.weights += global::Tensor::outer(deltaL, prevLayer); + global::Tensor::outer(deltaL, prevLayer, gradients.weights); *deltas = &deltaL; } diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index e6982c6..c8280ac 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": false, + "enableVisuals": true, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, @@ -10,14 +10,6 @@ "training config": { "batch size": 32, "batch count": 1000, - "auto save": { - "saveEvery": 2000, - "dataFilenameAutoSave": "mode.txt" - }, - "auto evaluating": { - "evaluateEvery": 10, - "dataBaseFilename": "../tests/data/database-binary_test" - }, "optimizer": { "type": "const", "lr": 0.1 From 0ef22708d5bf34f0491b75a543dcf48846cd8937 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 15:43:17 +0300 Subject: [PATCH 26/40] small change --- include/tensor.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 06995b1..41eda35 100644 --- 
a/include/tensor.hpp +++ b/include/tensor.hpp @@ -24,8 +24,6 @@ class Tensor { void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; - void loadTempGpu() const; - friend model::Activation; public: From 08572514e5e251e2dbff75cc337fb27c2e6601a8 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 19:00:12 +0300 Subject: [PATCH 27/40] improved performance --- include/tensor.hpp | 7 +++-- src/model/activations.cpp | 1 + src/model/model.cpp | 12 +++++---- src/model/tensor.cpp | 34 +++++++++++++++++-------- src/model/tensor_gpu.cu | 25 +++++++++--------- src/model/tensor_gpu.hpp | 16 ++++++------ src/networks/fnn/DenseLayer.cpp | 9 ++++--- src/visualizer/VisualizerController.cpp | 2 +- tests/data/config-binary_test.json | 6 ++--- 9 files changed, 67 insertions(+), 45 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 41eda35..ba5f7cd 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -10,6 +10,8 @@ class Activation; namespace nn::global { +constexpr bool GPU_MODE = true; + class Tensor { private: std::vector cpu_data; @@ -17,9 +19,9 @@ class Tensor { std::vector strides; ValueType *gpu_data = nullptr; - std::size_t gpu_data_size{0}; + std::size_t gpu_data_size; - static const bool isGpu{false}; + static const bool isGpu{GPU_MODE}; void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; @@ -34,6 +36,7 @@ class Tensor { ~Tensor(); Tensor &operator=(const Tensor &other); + Tensor &operator=(const std::vector &other); ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 9f537c3..f7a70b1 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,6 +1,7 @@ #include "activations.hpp" #include "tensor.hpp" #include "tensor_gpu.hpp" +#include namespace nn::model { void Activation::activate(const global::Tensor &net, global::Tensor &out) const { diff --git a/src/model/model.cpp b/src/model/model.cpp index 70249a7..a5a958f 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -160,7 +160,7 @@ void Model::updateWeights(const int batchSize) { void Model::Backward(const global::Tensor &output) { global::Tensor deltas = output; - global::Tensor *delta = &deltas; + global::Tensor *delta = &deltas; for (int i = static_cast(network.size()) - 1; i >= 0; --i) { network[i]->backward(&delta); @@ -179,16 +179,18 @@ global::ValueType Model::runBackPropagation( } resetNetworkGradient(); + global::Tensor output({outputSize()}); for (size_t i = 0; i < batch.size(); ++i) { TrainSample *current_sample_ptr = batch.samples.at(i); - visual.updatePrediction(current_sample_ptr->pre); + output.zero(); + // visual.updatePrediction(current_sample_ptr->pre); - runModel(transformation(current_sample_ptr->input)); + // runModel(transformation(current_sample_ptr->input)); - global::Tensor output({outputSize()}); - output.setValue({current_sample_ptr->pre.index}, 1); + runModel(current_sample_ptr->input); if (doBackward) { + output.setValue({current_sample_ptr->pre.index}, 1); Backward(output); updateWeights(batch.size()); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index f4b7795..2c297b2 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,8 +1,8 @@ #include "tensor_gpu.hpp" -#include #include #include #include +#include namespace nn::global { Tensor::Tensor(const std::vector &shape_, float init) { @@ -58,7 +58,7 @@ void 
Tensor::getData(std::vector &dest) const { void Tensor::fill(const ValueType &value) { if (isGpu) { tensor_gpu::zero(gpu_data, gpu_data_size); - tensor_gpu::add(gpu_data, value, gpu_data, gpu_data_size); + tensor_gpu::add_scalar(gpu_data, value, gpu_data, gpu_data_size); } else { for (auto &n : cpu_data) { n = value; @@ -100,6 +100,20 @@ Tensor &Tensor::operator=(const Tensor &other) { return *this; } +Tensor &Tensor::operator=(const std::vector &other) { + if (other.size() != numElements()) { + throw std::invalid_argument(""); + } + + if (!isGpu) { + cpu_data = other; + } else { + tensor_gpu::copyToDevice(gpu_data, other.data(), gpu_data_size * sizeof(ValueType)); + } + + return *this; +} + void Tensor::computeStrides() { const size_t dim = shape.size(); strides.resize(dim); @@ -147,7 +161,7 @@ Tensor &Tensor::operator+=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] += other.cpu_data[i]; } else { - tensor_gpu::add(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::add_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -160,7 +174,7 @@ Tensor &Tensor::operator-=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] -= other.cpu_data[i]; } else { - tensor_gpu::subtraction(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::subtraction_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -173,7 +187,7 @@ Tensor &Tensor::operator*=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] *= other.cpu_data[i]; } else { - tensor_gpu::multiply(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::multiply_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -186,7 +200,7 @@ Tensor &Tensor::operator/=(const Tensor &other) { for (size_t i = 0; i < N; ++i) cpu_data[i] /= other.cpu_data[i]; } else { - tensor_gpu::division(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + tensor_gpu::division_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); } return *this; } @@ -196,7 +210,7 @@ Tensor &Tensor::operator*=(ValueType scalar) { for (auto &x : cpu_data) x *= scalar; } else { - tensor_gpu::multiply(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -206,7 +220,7 @@ Tensor &Tensor::operator-=(ValueType scalar) { for (auto &x : cpu_data) x -= scalar; } else { - tensor_gpu::subtraction(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -216,7 +230,7 @@ Tensor &Tensor::operator+=(ValueType scalar) { for (auto &x : cpu_data) x += scalar; } else { - tensor_gpu::add(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } @@ -226,7 +240,7 @@ Tensor &Tensor::operator/=(ValueType scalar) { for (auto &x : cpu_data) x /= scalar; } else { - tensor_gpu::division(gpu_data, scalar, gpu_data, gpu_data_size); + tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size); } return *this; } diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 587c8de..7a33f34 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -60,7 +60,7 @@ __global__ void addKernel(const ValueType* A, const ValueType* B, ValueType* C, } // Element-wise addition: C = A + B -void add(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void add_vec(const ValueType* A, const 
ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); @@ -76,7 +76,7 @@ __global__ void subtractionKernel(const ValueType* A, const ValueType* B, ValueT } // Element-wise addition: C = A + B -void subtraction(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); @@ -91,7 +91,7 @@ __global__ void divisionKernel(const ValueType* A, const ValueType* B, ValueType } // Element-wise addition: C = A / B -void division(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); @@ -106,7 +106,7 @@ __global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType } // Element-wise multiply: C = A * B -void multiply(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); @@ -121,7 +121,7 @@ __global__ void addKernel(const ValueType* A, const ValueType B, ValueType* C, s } // Element-wise addition: C = A + B -void add(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void add_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addKernel<<>>(A, B, C, count); @@ -137,7 +137,7 @@ __global__ void subtractionKernel(const ValueType* A, const ValueType B, ValueTy } // Element-wise addition: C = A + B -void subtraction(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void subtraction_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subtractionKernel<<>>(A, B, C, count); @@ -152,7 +152,7 @@ __global__ void divisionKernel(const ValueType* A, const ValueType B, ValueType* } // Element-wise addition: C = A / B -void division(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void division_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divisionKernel<<>>(A, B, C, count); @@ -167,7 +167,7 @@ __global__ void multiplyKernel(const ValueType* A, const ValueType B, ValueType* } // Element-wise multiply: C = A * B -void multiply(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void multiply_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; multiplyKernel<<>>(A, B, C, count); @@ -321,7 +321,9 @@ __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::si if (idx >= count) return; // Load input into shared memory - shared[tid] = input[idx]; + if (idx < count) shared[tid] = input[idx]; + 
else shared[tid] = -INFINITY; // or 0 + __syncthreads(); // Step 1: Find max value for numerical stability @@ -512,16 +514,15 @@ __global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *r if (idx < total) { size_t i = idx / n; size_t j = idx % n; - result[i * n + j] += a[i] * b[j]; + result[i * n + j] = a[i] * b[j]; // Use '=' since result is zeroed before } } __global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { size_t col = blockIdx.x * blockDim.x + threadIdx.x; if (col < N) { - ValueType sum = 0; + ValueType sum = 0.0f; for (size_t i = 0; i < M; ++i) { - // W is M x N, access element at (i, col) sum += W[i * N + col] * V[i]; } R[col] = sum; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 4e48651..153ce14 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -29,28 +29,28 @@ void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t coun void zero(ValueType *deviceData, std::size_t count); /// Element-wise addition: C = A + B -void add(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void add_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise addition: C = A - B -void subtraction(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void subtraction_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise addition: C = A / B -void division(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void division_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise multiply: C = A * B -void multiply(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); +void multiply_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); /// Element-wise addition: C = A + B -void add(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void add_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); /// Element-wise addition: C = A - B -void subtraction(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void subtraction_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); /// Element-wise addition: C = A / B -void division(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void division_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); /// Element-wise multiply: C = A * B -void multiply(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); +void multiply_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index cc7b208..8584637 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -1,5 +1,6 @@ #include "DenseLayer.hpp" #include +#include namespace nn::model::fnn { DenseLayer::DenseLayer( @@ -167,11 +168,11 @@ void DenseLayer::fillParamRandom() { global::ValueType std_dev = std::sqrt(2.0 / static_cast(prevSize())); std::normal_distribution<> dist(0.0, std_dev); - for (size_t i = 0; i < parameters.size(); ++i) { - for (size_t j = 0; j < parameters.prevSize(); ++j) { - parameters.weights.setValue({i, j}, dist(gen)); - } + std::vector 
temp(parameters.weights.numElements()); + for (size_t i = 0; i < temp.size(); ++i) { + temp[i] = dist(gen); } + parameters.weights = temp; } void DenseLayer::resetDots() { diff --git a/src/visualizer/VisualizerController.cpp b/src/visualizer/VisualizerController.cpp index f1e9a60..fc752d3 100644 --- a/src/visualizer/VisualizerController.cpp +++ b/src/visualizer/VisualizerController.cpp @@ -3,7 +3,6 @@ namespace nn::visualizer { VisualManager::VisualManager(const model::Config &_config) : config(_config) { - printf("start Visualizer\n"); } void VisualManager::initState() { @@ -44,6 +43,7 @@ void VisualManager::start() { } void VisualManager::startVisuals() { + printf("start Visualizer\n"); Vstate = std::make_shared(config); if (!Vstate) { return; diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index c8280ac..c078a9e 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": true, + "enableVisuals": false, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, @@ -22,8 +22,8 @@ "output size": 16, "output activation": 4, "layers": [ - { "size": 100, "activationType": 1 }, - { "size": 30, "activationType": 1 } + { "size": 1000, "activationType": 1 }, + { "size": 3000, "activationType": 1 } ] } ] From c5530d153f93f110d68b9a53c4c80452894dd85e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maayan=20Portugues=20=F0=9F=8E=97=EF=B8=8F?= Date: Fri, 8 Aug 2025 19:04:29 +0300 Subject: [PATCH 28/40] Update cmake-multi-platform.yml for cuda --- .github/workflows/cmake-multi-platform.yml | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 4e76790..c904d29 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -11,6 +11,9 @@ jobs: runs-on: ubuntu-latest steps: + - name: Checkout code + uses: actions/checkout@v4 + - name: Install dependencies run: | sudo apt-get update @@ -26,13 +29,23 @@ jobs: libgl1-mesa-dev \ libegl1-mesa-dev \ libxi-dev \ - libfreetype6-dev + libfreetype6-dev \ + wget - - name: Checkout code - uses: actions/checkout@v4 + - name: Install NVIDIA CUDA Toolkit + run: | + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin + sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 + sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub + sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" + sudo apt-get update + sudo apt-get -y install cuda-toolkit-12-4 # Or latest version you support + echo "/usr/local/cuda/bin" >> $GITHUB_PATH + echo "/usr/local/cuda/lib64" | sudo tee /etc/ld.so.conf.d/cuda.conf + sudo ldconfig - name: Configure - run: cmake -B build -DBUILD_SHARED_LIBS=TRUE + run: cmake -B build -DBUILD_SHARED_LIBS=TRUE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc - name: Build run: cmake --build build --config Release From 7f8d49c9bbb83de1c77050900e89e2e29a6eb938 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 19:12:16 +0300 Subject: [PATCH 29/40] small change - restoring --- .github/workflows/cmake-multi-platform.yml | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/.github/workflows/cmake-multi-platform.yml 
b/.github/workflows/cmake-multi-platform.yml index c904d29..560d323 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -11,9 +11,6 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Install dependencies run: | sudo apt-get update @@ -29,23 +26,14 @@ jobs: libgl1-mesa-dev \ libegl1-mesa-dev \ libxi-dev \ - libfreetype6-dev \ - wget + libfreetype6-dev - - name: Install NVIDIA CUDA Toolkit - run: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin - sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub - sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" - sudo apt-get update - sudo apt-get -y install cuda-toolkit-12-4 # Or latest version you support - echo "/usr/local/cuda/bin" >> $GITHUB_PATH - echo "/usr/local/cuda/lib64" | sudo tee /etc/ld.so.conf.d/cuda.conf - sudo ldconfig + - name: Checkout code + uses: actions/checkout@v4 - name: Configure - run: cmake -B build -DBUILD_SHARED_LIBS=TRUE -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc + run: cmake -B build -DBUILD_SHARED_LIBS=TRUE - name: Build run: cmake --build build --config Release + From fd16a3b0a77a5a2b644b69ed6a4e5bfddd8dec0d Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 21:38:23 +0300 Subject: [PATCH 30/40] improved performance --- include/network/IvisualNetwork.hpp | 3 +- include/tensor.hpp | 3 ++ src/model/activations.hpp | 14 +++----- src/model/dataBase.hpp | 1 + src/model/tensor.cpp | 12 +++++++ src/networks/fnn/DenseLayer.cpp | 56 +++++++++++++----------------- src/networks/fnn/DenseLayer.hpp | 9 +++-- src/networks/fnn/FNNetwork.cpp | 16 +++------ src/networks/fnn/FnnVisualizer.cpp | 1 - src/visualizer/visualModel.cpp | 4 ++- tests/binary_test.cpp | 17 +++++---- tests/data/config-binary_test.json | 6 ++-- 12 files changed, 70 insertions(+), 72 deletions(-) diff --git a/include/network/IvisualNetwork.hpp b/include/network/IvisualNetwork.hpp index ae08c3a..58b49af 100644 --- a/include/network/IvisualNetwork.hpp +++ b/include/network/IvisualNetwork.hpp @@ -3,8 +3,7 @@ #include "../../src/visualizer/panel.hpp" #include -#include -#include +#include namespace nn::visualizer { constexpr std::uint32_t MODEL_HEIGHT = 770u; diff --git a/include/tensor.hpp b/include/tensor.hpp index ba5f7cd..1064f1d 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -40,6 +40,9 @@ class Tensor { ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); + void insertRange(const Tensor &other, + const size_t startO, const size_t startT, + const size_t length); // Shape and size size_t numElements() const; diff --git a/src/model/activations.hpp b/src/model/activations.hpp index 9c6b4a2..010de2a 100644 --- a/src/model/activations.hpp +++ b/src/model/activations.hpp @@ -2,10 +2,10 @@ #define ACTIVATIONSP #include "tensor.hpp" -#include #include namespace nn::model { + constexpr global::ValueType RELU_LEAKY_ALPHA = 0.01; constexpr global::ValueType maxValue(const global::ValueType &a, const float &b) { @@ -38,16 +38,13 @@ class Activation { static global::ValueType derivativeTanh(const global::ValueType z); static void relu(const global::Tensor &net, global::Tensor &out); - static void derivativeRelu(const global::Tensor 
&net, - global::Tensor &out); + static void derivativeRelu(const global::Tensor &net, global::Tensor &out); - static void leakyRelu(const global::Tensor &net, - global::Tensor &out); + static void leakyRelu(const global::Tensor &net, global::Tensor &out); static void derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out); - static void sigmoid(const global::Tensor &net, - global::Tensor &out); + static void sigmoid(const global::Tensor &net, global::Tensor &out); static void derivativeSigmoid(const global::Tensor &net, global::Tensor &out); @@ -55,8 +52,7 @@ class Activation { static void derivativeTanh(const global::Tensor &net, global::Tensor &out); - static void softmax(const global::Tensor &net, - global::Tensor &out); + static void softmax(const global::Tensor &net, global::Tensor &out); static global::ValueType maxVector(const global::Tensor &metrix); diff --git a/src/model/dataBase.hpp b/src/model/dataBase.hpp index 0ad890a..cf17ba8 100644 --- a/src/model/dataBase.hpp +++ b/src/model/dataBase.hpp @@ -4,6 +4,7 @@ #include "config.hpp" #include #include +#include namespace nn::model { const std::string DATABASE_FILE_EXETENTION = ".nndb"; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 2c297b2..fe1cd65 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -145,6 +145,18 @@ ValueType Tensor::getValue(const std::vector &indices) const { return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } +void Tensor::insertRange(const Tensor &other, + const size_t startO, const size_t startT, + const size_t length) { + if (isGpu) { + tensor_gpu::copyDeviceToDevice(gpu_data + startT, other.gpu_data + startO, length * sizeof(ValueType)); + } else { + for (size_t i = 0; i < length; ++i) { + cpu_data[i + startT] = other.cpu_data[i + startO]; + } + } +} + void Tensor::setValue(const std::vector &indices, const ValueType value) { if (!isGpu) { cpu_data[flattenIndex(indices)] = value; diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index 8584637..d955dc3 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -1,4 +1,5 @@ #include "DenseLayer.hpp" +#include #include #include @@ -12,8 +13,8 @@ DenseLayer::DenseLayer( out({size}), parameters(size, prevSize), gradients(size, prevSize), - activationFunction(activation), - deltaL({size}) { + deltaL({size}), + activationFunction(activation) { if (randomInit) { fillParamRandom(); } @@ -29,9 +30,12 @@ void Hidden_Layer::CreateDropoutMask() { static thread_local std::mt19937 rng{std::random_device{}()}; std::bernoulli_distribution bernoulli(keepProb); + static std::vector temp(dropoutMask.numElements(), 0); for (size_t i = 0; i < dropoutMask.numElements(); ++i) { - dropoutMask.setValue({i}, static_cast(bernoulli(rng))); + temp[i] = static_cast(bernoulli(rng)); } + + dropoutMask = temp; } void Output_Layer::forward(const global::Tensor &metrix) { @@ -125,41 +129,29 @@ void DenseLayer::updateWeight(nn::model::IOptimizer &optimizer) { } const global::Tensor DenseLayer::getData() const { - global::Tensor matrix({parameters.paramSize()}); - - size_t currentI = 0; - for (size_t i = 0; i < size(); ++i) { - for (size_t j = 0; j < prevSize(); ++j) { - matrix.setValue({currentI}, parameters.weights.getValue({i, j})); + size_t weightsSize = parameters.weights.numElements(); + size_t biasesSize = parameters.biases.numElements(); - ++currentI; - } - } + global::Tensor matrix({weightsSize + biasesSize}); - for (size_t i = 0; i < size(); ++i) { - 
matrix.setValue({currentI}, parameters.biases.getValue({i})); + // Copy weights + matrix.insertRange(parameters.weights, 0, 0, weightsSize); - ++currentI; - } + // Copy biases + matrix.insertRange(parameters.biases, 0, weightsSize, biasesSize); return matrix; } -void DenseLayer::setData(const global::Tensor newParam) { - size_t currentI = 0; - for (size_t i = 0; i < size(); ++i) { - for (size_t j = 0; j < prevSize(); ++j) { - parameters.weights.setValue({i, j}, newParam.getValue({currentI})); +void DenseLayer::setData(const global::Tensor newParam, const size_t offset) { + size_t weightsSize = parameters.weights.numElements(); + size_t biasesSize = parameters.biases.numElements(); - ++currentI; - } - } - - for (size_t i = 0; i < size(); ++i) { - parameters.biases.setValue({i}, newParam.getValue({currentI})); + // Copy into weights + parameters.weights.insertRange(newParam, offset, 0, weightsSize); - ++currentI; - } + // Copy into biases + parameters.biases.insertRange(newParam, offset + weightsSize, 0, biasesSize); } void DenseLayer::fillParamRandom() { @@ -168,11 +160,11 @@ void DenseLayer::fillParamRandom() { global::ValueType std_dev = std::sqrt(2.0 / static_cast(prevSize())); std::normal_distribution<> dist(0.0, std_dev); - std::vector temp(parameters.weights.numElements()); + std::vector temp(parameters.weights.numElements()); for (size_t i = 0; i < temp.size(); ++i) { - temp[i] = dist(gen); + temp[i] = dist(gen); } - parameters.weights = temp; + parameters.weights = temp; } void DenseLayer::resetDots() { diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp index 04c807f..651adb2 100644 --- a/src/networks/fnn/DenseLayer.hpp +++ b/src/networks/fnn/DenseLayer.hpp @@ -1,9 +1,8 @@ #ifndef DENSELAYER #define DENSELAYER -#include "../../model/config.hpp" #include "../src/model/optimizers.hpp" -#include "tensor.hpp" +#include namespace nn::model::fnn { constexpr global::ValueType MIN_LOSS_VALUE = 1e-10; @@ -15,7 +14,6 @@ struct LayerParams { size_t size_; size_t prevSize_; - LayerParams(size_t out_dim, size_t in_dim) : weights({out_dim, in_dim}), biases({out_dim}), size_(out_dim), prevSize_(in_dim) {} @@ -34,10 +32,11 @@ class DenseLayer { LayerParams parameters; LayerParams gradients; + global::Tensor deltaL; + Activation activationFunction; bool isTraining{false}; - global::Tensor deltaL; void fillParamRandom(); @@ -72,7 +71,7 @@ class DenseLayer { size_t getParamCount() const; const global::Tensor getData() const; - void setData(const global::Tensor newParam); + void setData(const global::Tensor newParam, const size_t offset); void setTraining(const bool state) { isTraining = state; } }; diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp index d723cc5..89621c1 100644 --- a/src/networks/fnn/FNNetwork.cpp +++ b/src/networks/fnn/FNNetwork.cpp @@ -157,10 +157,8 @@ global::Tensor FNNetwork::getParams() const { for (size_t i = 0; i < layers.size(); ++i) { global::Tensor params = layers[i]->getData(); - for (size_t j = 0; j < params.numElements(); ++j) { - matrix.setValue({matrixI}, params.getValue({j})); - ++matrixI; - } + matrix.insertRange(params, 0, matrixI, params.numElements()); + matrixI += params.numElements(); } return matrix; @@ -169,14 +167,8 @@ global::Tensor FNNetwork::getParams() const { void FNNetwork::setParams(const global::Tensor params) { size_t j = 0; for (size_t i = 0; i < layers.size(); ++i) { - global::Tensor newParam({layers[i]->getParamCount()}); - - for (size_t k = 0; k < newParam.numElements(); ++k) { - 
newParam.setValue({k}, params.getValue({j})); - ++j; - } - - layers[i]->setData(newParam); + layers[i]->setData(params, j); + j += layers[i]->getParamCount(); if (visual) { visual->setParam(i, layers[i]->getParms()); diff --git a/src/networks/fnn/FnnVisualizer.cpp b/src/networks/fnn/FnnVisualizer.cpp index 47359a0..1726a78 100644 --- a/src/networks/fnn/FnnVisualizer.cpp +++ b/src/networks/fnn/FnnVisualizer.cpp @@ -254,7 +254,6 @@ void VisualDenseLayer::setGrad(const model::fnn::LayerParams &newGrad) { } void FnnVisualier::setWidth(const std::uint32_t newWidth) { - visualWidth = newWidth; if (networkRender.resize({newWidth, networkRender.getSize().y})) { } diff --git a/src/visualizer/visualModel.cpp b/src/visualizer/visualModel.cpp index 4be7484..f22f039 100644 --- a/src/visualizer/visualModel.cpp +++ b/src/visualizer/visualModel.cpp @@ -157,7 +157,9 @@ void ModelPanel::renderSubNetwork(const size_t index) { } void ModelPanel::setPrediction(const global::Prediction &pre) { - global::Tensor output({predictionLayer.size()}); + static global::Tensor output({predictionLayer.size()}); + output.zero(); + output.setValue({pre.index}, 1); predictionLayer.setValues(output); diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 7bdd693..17b318a 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -66,7 +66,7 @@ void print_database(int actual_size, int input_size, int database_size) { } void printVector(const nn::global::Tensor &vec) { - for (size_t i =0; i < vec.numElements(); ++i) { + for (size_t i = 0; i < vec.numElements(); ++i) { std::cout << vec.getValue({i}) << ' '; } @@ -77,20 +77,23 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = tests::appendToBase("config-binary_test.json"); - // nn::global::Tensor give_me_a_name({5, 3}); - // printf("test: \n"); - // give_me_a_name.setValue({2, 1}, 5); - // return 0; + // nn::global::Tensor give_me_a_name({5, 3}); + // printf("test: \n"); + // give_me_a_name.setValue({2, 1}, 5); + // return 0; nn::model::Model model(config_FN); if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - std::vector files {"../tests/data/test1", "../tests/data/test2"}; + model.load("test.txt"); + nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); + std::cout << "training result: " << result.percentage << "%\n"; + std::vector files{"../tests/data/test1", "../tests/data/test2"}; model.train(files); - nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); + result = model.evaluateModel("../tests/data/database-binary_test"); std::cout << "training result: " << result.percentage << "%\n"; model.save("test.txt"); diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index c078a9e..fb3283a 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -8,11 +8,11 @@ ] }, "training config": { - "batch size": 32, + "batch size": 16, "batch count": 1000, "optimizer": { "type": "const", - "lr": 0.1 + "lr": 0.5 } }, "network config": [ @@ -23,7 +23,7 @@ "output activation": 4, "layers": [ { "size": 1000, "activationType": 1 }, - { "size": 3000, "activationType": 1 } + { "size": 300, "activationType": 1 } ] } ] From 09174e4f9def6c10e232c1779c770ef8f4cb65a6 Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 23:23:22 +0300 Subject: [PATCH 31/40] small changes --- src/model/tensor.cpp | 1 - src/model/tensor_gpu.cu | 121 --------------------------------------- 
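The lines removed from tensor_gpu.cu here are the device-side indexing helpers (computeStridesDevice, flattenIndexGpu, getValueAtIndices, setValueAtIndices), each of which resolved a single index on the GPU at the cost of temporary device allocations and a cudaDeviceSynchronize. Element access now flattens the index on the host from the cached strides and transfers only the one value, roughly as follows (a sketch of the pattern behind Tensor::getValue, assuming the usual stride-based flattenIndex):

    // Host-side flattening: flat = sum over d of indices[d] * strides[d]
    size_t flat = 0;
    for (size_t d = 0; d < indices.size(); ++d)
        flat += indices[d] * strides[d];
    return tensor_gpu::getValueAt(gpu_data, flat);   // one cudaMemcpy of sizeof(ValueType)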
src/model/tensor_gpu.hpp | 19 +----- 3 files changed, 1 insertion(+), 140 deletions(-) diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index fe1cd65..ba95d29 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -2,7 +2,6 @@ #include #include #include -#include namespace nn::global { Tensor::Tensor(const std::vector &shape_, float init) { diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 7a33f34..da29857 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -26,7 +26,6 @@ void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } - void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t size) { cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); } @@ -173,19 +172,6 @@ void multiply_scalar(const ValueType* A, const ValueType B, ValueType* C, std::s multiplyKernel<<>>(A, B, C, count); } -__global__ void computeStrides(const size_t *shape, size_t *strides, size_t ndim) { - size_t stride = 1; - for (int i = ndim - 1; i >= 0; --i) { - strides[i] = stride; - stride *= shape[i]; - } -} - -void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim) { - computeStrides<<<1, 1>>>(gpu_shape, gpu_strides, ndim); - cudaDeviceSynchronize(); -} - // Kernel to apply ReLU activation: max(0, x) __global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -381,31 +367,6 @@ __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, *outIndex = idx; } -// Host function to launch kernel -size_t flattenIndexGpu(const size_t* h_indices, const size_t* d_shape, - const size_t* d_strides, size_t ndim) { - size_t *d_indices, *d_outIndex; - cudaMalloc(&d_indices, ndim * sizeof(size_t)); - cudaMalloc(&d_outIndex, sizeof(size_t)); - - cudaMemcpy(d_indices, h_indices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); - - flattenIndexKernel<<<1, 1>>>(d_indices, d_shape, d_strides, ndim, d_outIndex); - cudaDeviceSynchronize(); - - size_t result; - cudaMemcpy(&result, d_outIndex, sizeof(size_t), cudaMemcpyDeviceToHost); - - cudaFree(d_indices); - cudaFree(d_outIndex); - - if (result == size_t(-1)) { - throw std::out_of_range("Flattened index out of bounds."); - } - - return result; -} - __global__ void computeFlatIndexKernel( const size_t* indices, const size_t* strides, size_t rank, size_t* outIndex @@ -417,86 +378,6 @@ __global__ void computeFlatIndexKernel( *outIndex = flatIndex; } -ValueType getValueAtIndices( - const ValueType* deviceData, - const size_t* hostIndices, - const size_t* deviceStrides, - size_t size -) { - // Copy host indices to device - size_t* deviceIndices; - cudaMalloc(&deviceIndices, sizeof(size_t) * size); - cudaMemcpy(deviceIndices, hostIndices, sizeof(size_t) * size, cudaMemcpyHostToDevice); - - // Allocate output for index - size_t* deviceFlatIndex; - cudaMalloc(&deviceFlatIndex, sizeof(size_t)); - - // Launch kernel to compute flat index - computeFlatIndexKernel<<<1, 1>>>( - deviceIndices, deviceStrides, size, deviceFlatIndex - ); - cudaDeviceSynchronize(); - - // Copy back flat index - size_t flatIndex; - cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); - - // Get value at that index - ValueType value; - cudaMemcpy(&value, deviceData + flatIndex, sizeof(ValueType), cudaMemcpyDeviceToHost); - - // Cleanup - cudaFree(deviceIndices); - cudaFree(deviceFlatIndex); - 
- return value; -} - -__global__ void setValueAtIndexKernel(ValueType* data, size_t flatIndex, ValueType value) { - data[flatIndex] = value; -} - -void setValueAtIndices( - ValueType* deviceData, - const size_t* hostIndices, - const size_t* deviceStrides, - size_t ndim, - ValueType value -) { - // Step 1: Allocate and copy indices to GPU - size_t* deviceIndices; - cudaMalloc(&deviceIndices, ndim * sizeof(size_t)); - cudaMemcpy(deviceIndices, hostIndices, ndim * sizeof(size_t), cudaMemcpyHostToDevice); - - // Step 2: Allocate memory to store computed flat index - size_t* deviceFlatIndex; - cudaMalloc(&deviceFlatIndex, sizeof(size_t)); - - // Step 3: Launch kernel to compute flat index - computeFlatIndexKernel<<<1, 1>>>(deviceIndices, deviceStrides, ndim, deviceFlatIndex); - cudaDeviceSynchronize(); - - // Step 4: Copy flat index to host - size_t flatIndex; - cudaMemcpy(&flatIndex, deviceFlatIndex, sizeof(size_t), cudaMemcpyDeviceToHost); - - // Step 5: Validate flat index - if (flatIndex == size_t(-1)) { - cudaFree(deviceIndices); - cudaFree(deviceFlatIndex); - throw std::out_of_range("Invalid indices in setValueAtIndices"); - } - - // Step 6: Launch kernel to set value at computed flat index - setValueAtIndexKernel<<<1, 1>>>(deviceData, flatIndex, value); - cudaDeviceSynchronize(); - - // Cleanup - cudaFree(deviceIndices); - cudaFree(deviceFlatIndex); -} - __global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { size_t row = blockIdx.x * blockDim.x + threadIdx.x; if (row < M) { @@ -529,8 +410,6 @@ __global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType } } -// Wrapper functions to launch kernels - void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { const int blockSize = 256; int gridSize = (M + blockSize - 1) / blockSize; diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index 153ce14..ee65e49 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -23,6 +23,7 @@ void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); /// Copy data from GPU to CPU. void copyToHost(void *hostDst, const void *deviceSrc, std::size_t count); +/// Copy data from GPU to GPU. void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count); /// Set all elements to zero (on GPU). 
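For reference while reading this header, the three linear-algebra entry points declared just below have the following shapes and semantics, matching the kernels in tensor_gpu.cu (summarized as C-style pseudocode):

    // matmul : R[i]       = sum_k A[i*K + k] * B[k]   // A is M x K row-major, B length K, R length M
    // matmulT: R[j]       = sum_i W[i*N + j] * V[i]   // R = W^T * V; W is M x N, V length M, R length N
    // outer  : R[i*n + j] = a[i] * b[j]               // a length m, b length n, R is m x n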
@@ -52,8 +53,6 @@ void division_scalar(const ValueType *A, const ValueType B, ValueType *C, std::s /// Element-wise multiply: C = A * B void multiply_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); -void computeStridesDevice(const size_t *gpu_shape, size_t *gpu_strides, std::size_t ndim); - // ---------------- ReLU ---------------- void relu(const ValueType *input, ValueType *output, std::size_t count); void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); @@ -74,24 +73,8 @@ void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_ void softmax(const ValueType *net, ValueType *out, std::size_t size); ValueType getValueAt(const ValueType *devicePtr, std::size_t index); - void setValueAt(ValueType *devicePtr, std::size_t index, ValueType value); -size_t flattenIndexGpu(const size_t *indices, const size_t *d_shape, const size_t *d_strides, size_t ndim); - -ValueType getValueAtIndices( - const ValueType *deviceData, - const size_t *hostIndices, - const size_t *deviceStrides, - size_t size); - -void setValueAtIndices( - ValueType *deviceData, - const size_t *hostIndices, - const size_t *deviceStrides, - size_t ndim, - ValueType value); - void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N); From a3825f8045c5c82dbb280272a456674cd3e4c72f Mon Sep 17 00:00:00 2001 From: maayan Date: Fri, 8 Aug 2025 23:37:49 +0300 Subject: [PATCH 32/40] small formating changes --- src/model/tensor_gpu.cu | 301 ++++++++++++++++----------------------- src/model/tensor_gpu.hpp | 130 +++++++++-------- 2 files changed, 186 insertions(+), 245 deletions(-) diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index da29857..1a00003 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -4,206 +4,179 @@ #include namespace nn::global::tensor_gpu { -// Allocate memory on GPU for a tensor. + +// ================================================== +// Memory Management +// ================================================== void* allocate(std::size_t size) { void* devicePtr = nullptr; - cudaError_t err1 = cudaMalloc(&devicePtr, size); - if (err1 != cudaSuccess) { + if (cudaMalloc(&devicePtr, size) != cudaSuccess) { throw std::runtime_error("cudaMalloc failed"); } return devicePtr; } -// Free GPU memory. void deallocate(void* devicePtr) { if (devicePtr) { cudaFree(devicePtr); } } -// Copy data from CPU to GPU. -void copyToDevice(void* deviceDst, const void * hostSrc, std::size_t size) { +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t size) { cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); } -void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t size) { +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t size) { cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); } -// Copy data from GPU to CPU. void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost); } -// Kernel to set all elements to zero. 
+void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { + cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); +} + +ValueType getValueAt(const ValueType* devicePtr, std::size_t index) { + ValueType value; + cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); + return value; +} + +// ================================================== +// Utility Kernels +// ================================================== __global__ void zeroKernel(ValueType* data, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - data[idx] = 0.0f; - } + if (idx < count) data[idx] = 0.0f; } -// Set all elements to zero (on GPU). void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); } -// Kernel for element-wise addition: C = A + B -__global__ void addKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +// ================================================== +// Vector-Vector Operations +// ================================================== +__global__ void addVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] + B[idx]; - } + if (idx < count) C[idx] = A[idx] + B[idx]; } -// Element-wise addition: C = A + B -void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - addKernel<<>>(A, B, C, count); +__global__ void subVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] - B[idx]; } +__global__ void mulVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] * B[idx]; +} -// Kernel for element-wise addition: C = A - B -__global__ void subtractionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +__global__ void divVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] - B[idx]; - } + if (idx < count) C[idx] = A[idx] / B[idx]; } -// Element-wise addition: C = A + B -void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - subtractionKernel<<>>(A, B, C, count); + addVecKernel<<>>(A, B, C, count); } -// Kernel for element-wise addition: C = A / B -__global__ void divisionKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] / B[idx]; - } -} - -// Element-wise addition: C = A / B -void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - 
divisionKernel<<>>(A, B, C, count); + subVecKernel<<>>(A, B, C, count); } -// Kernel for element-wise multiplication: C = A * B -__global__ void multiplyKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] * B[idx]; - } +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + mulVecKernel<<>>(A, B, C, count); } -// Element-wise multiply: C = A * B -void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - multiplyKernel<<>>(A, B, C, count); + divVecKernel<<>>(A, B, C, count); } -// Kernel for element-wise addition: C = A + B -__global__ void addKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +// ================================================== +// Vector-Scalar Operations +// ================================================== +__global__ void addScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] + B; - } + if (idx < count) C[idx] = A[idx] + B; } -// Element-wise addition: C = A + B -void add_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - addKernel<<>>(A, B, C, count); +__global__ void subScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] - B; } +__global__ void mulScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) C[idx] = A[idx] * B; +} -// Kernel for element-wise addition: C = A - B -__global__ void subtractionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +__global__ void divScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] - B; - } + if (idx < count) C[idx] = A[idx] / B; } -// Element-wise addition: C = A + B -void subtraction_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - subtractionKernel<<>>(A, B, C, count); -} - -// Kernel for element-wise addition: C = A / B -__global__ void divisionKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] / B; - } + addScalarKernel<<>>(A, B, C, count); } -// Element-wise addition: C = A / B -void division_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - 
divisionKernel<<>>(A, B, C, count); + subScalarKernel<<>>(A, B, C, count); } -// Kernel for element-wise multiplication: C = A * B -__global__ void multiplyKernel(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - C[idx] = A[idx] * B; - } +void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + mulScalarKernel<<>>(A, B, C, count); } -// Element-wise multiply: C = A * B -void multiply_scalar(const ValueType* A, const ValueType B, ValueType* C, std::size_t count) { +void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; - multiplyKernel<<>>(A, B, C, count); + divScalarKernel<<>>(A, B, C, count); } -// Kernel to apply ReLU activation: max(0, x) -__global__ void reluKernel(const ValueType *input, ValueType *output, std::size_t count) { +// ================================================== +// Activation Functions +// ================================================== +__global__ void reluKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = input[idx] > 0.0 ? input[idx] : 0.0f; - } + if (idx < count) output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f; +} + +__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) output[idx] = input[idx] > 0.0f ? 1.0f : 0.0f; } -// Apply activation function (e.g., ReLU) -void relu(const ValueType *input, ValueType *output, std::size_t count) { +void relu(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluKernel<<>>(input, output, count); } -// Kernel to apply ReLU derivative: -// output[i] = input[i] > 0 ? 1 : 0 -__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = (input[idx] > 0.0f) ? 
1.0f : 0.0f; - } -} - -// Apply derivative of activation function (e.g., ReLU') void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); } -// Kernel to apply Sigmoid activation: 1 / (1 + exp(-x)) __global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { @@ -212,14 +185,6 @@ __global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::si } } -// Apply Sigmoid activation -void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - sigmoidKernel<<>>(input, output, count); -} - -// Kernel for Sigmoid derivative: s(x) * (1 - s(x)) __global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { @@ -229,29 +194,23 @@ __global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* outpu } } -// Apply Sigmoid derivative +void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + sigmoidKernel<<>>(input, output, count); +} + void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidDerivativeKernel<<>>(input, output, count); } -// Kernel to apply Tanh activation: tanh(x) __global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = tanhf(input[idx]); - } + if (idx < count) output[idx] = tanhf(input[idx]); } -// Apply Tanh activation -void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) { - std::size_t blockSize = 256; - std::size_t numBlocks = (count + blockSize - 1) / blockSize; - tanhKernel<<>>(input, output, count); -} - -// Kernel for Tanh derivative: 1 - tanh(x)^2 __global__ void tanhDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < count) { @@ -260,78 +219,69 @@ __global__ void tanhDerivativeKernel(const ValueType* input, ValueType* output, } } -// Apply Tanh derivative +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + tanhKernel<<>>(input, output, count); +} + void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhDerivativeKernel<<>>(input, output, count); } -// Kernel for Leaky ReLU: x > 0 ? x : alpha * x __global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - ValueType x = input[idx]; - output[idx] = (x > 0.0f) ? x : alpha * x; - } + if (idx < count) output[idx] = (input[idx] > 0.0f) ? 
input[idx] : alpha * input[idx]; +} + +__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) output[idx] = (input[idx] > 0.0f) ? 1.0f : alpha; } -// Apply Leaky ReLU void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluKernel<<>>(input, output, count, alpha); } -// Kernel for Leaky ReLU derivative: x > 0 ? 1 : alpha -__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { - std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < count) { - output[idx] = (input[idx] > 0.0f) ? 1.0f : alpha; - } -} - -// Apply Leaky ReLU derivative void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluDerivativeKernel<<>>(input, output, count, alpha); } +// ================================================== +// Softmax +// ================================================== __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { extern __shared__ ValueType shared[]; std::size_t tid = threadIdx.x; std::size_t idx = blockIdx.x * blockDim.x + tid; - if (idx >= count) return; - // Load input into shared memory - if (idx < count) shared[tid] = input[idx]; - else shared[tid] = -INFINITY; // or 0 - + shared[tid] = (idx < count) ? input[idx] : -INFINITY; __syncthreads(); - // Step 1: Find max value for numerical stability ValueType max_val = shared[0]; - for (std::size_t i = 1; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + for (std::size_t i = 1; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { max_val = fmaxf(max_val, shared[i]); } __syncthreads(); - // Step 2: Compute exp(x - max) ValueType e = expf(shared[tid] - max_val); shared[tid] = e; __syncthreads(); - // Step 3: Sum of exponentials ValueType sum = 0.0f; - for (std::size_t i = 0; i < blockDim.x && blockIdx.x * blockDim.x + i < count; ++i) { + for (std::size_t i = 0; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { sum += shared[i]; } __syncthreads(); - // Step 4: Normalize output[idx] = shared[tid] / sum; } @@ -339,21 +289,12 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; std::size_t sharedMemSize = blockSize * sizeof(ValueType); - softmaxKernel<<>>(input, output, count); } -void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { - cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); -} - -ValueType getValueAt(const ValueType* devicePtr , std::size_t index) { - ValueType value; - cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); - return value; -} - -// Kernel to compute flattened index +// ================================================== +// Index Utilities +// ================================================== __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, const size_t* strides, size_t ndim, size_t* outIndex) { size_t idx = 0; @@ -367,10 +308,8 @@ __global__ void flattenIndexKernel(const size_t* indices, const size_t* shape, 
*outIndex = idx; } -__global__ void computeFlatIndexKernel( - const size_t* indices, const size_t* strides, - size_t rank, size_t* outIndex -) { +__global__ void computeFlatIndexKernel(const size_t* indices, const size_t* strides, + size_t rank, size_t* outIndex) { size_t flatIndex = 0; for (size_t i = 0; i < rank; ++i) { flatIndex += indices[i] * strides[i]; @@ -378,7 +317,10 @@ __global__ void computeFlatIndexKernel( *outIndex = flatIndex; } -__global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { +// ================================================== +// Matrix Operations +// ================================================== +__global__ void matmulKernel(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) { size_t row = blockIdx.x * blockDim.x + threadIdx.x; if (row < M) { ValueType sum = 0; @@ -389,17 +331,17 @@ __global__ void matmulKernel(const ValueType *A, const ValueType *B, ValueType * } } -__global__ void outerKernel(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { +__global__ void outerKernel(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) { size_t idx = blockIdx.x * blockDim.x + threadIdx.x; size_t total = m * n; if (idx < total) { size_t i = idx / n; size_t j = idx % n; - result[i * n + j] = a[i] * b[j]; // Use '=' since result is zeroed before + result[i * n + j] = a[i] * b[j]; } } -__global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { +__global__ void matmulTKernel(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) { size_t col = blockIdx.x * blockDim.x + threadIdx.x; if (col < N) { ValueType sum = 0.0f; @@ -410,24 +352,25 @@ __global__ void matmulTKernel(const ValueType *W, const ValueType *V, ValueType } } -void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K) { +void matmul(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) { const int blockSize = 256; int gridSize = (M + blockSize - 1) / blockSize; matmulKernel<<>>(A, B, R, M, K); cudaDeviceSynchronize(); } -void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n) { +void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) { const int blockSize = 256; int gridSize = (m * n + blockSize - 1) / blockSize; outerKernel<<>>(a, b, result, m, n); cudaDeviceSynchronize(); } -void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N) { +void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) { const int blockSize = 256; int gridSize = (N + blockSize - 1) / blockSize; matmulTKernel<<>>(W, V, R, M, N); cudaDeviceSynchronize(); } -} // namespace tensor_gpu + +} // namespace nn::global::tensor_gpu diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp index ee65e49..ad2ba02 100644 --- a/src/model/tensor_gpu.hpp +++ b/src/model/tensor_gpu.hpp @@ -11,73 +11,71 @@ class Tensor; // Forward declaration namespace nn::global::tensor_gpu { -/// Allocate memory on GPU for a tensor. 
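// For reference, the semantics of the three matrix routines above (grounded in
// the kernels and in the CPU path of Tensor::matmul): despite the name, matmul
// is a matrix-vector product, matmulT applies the transposed matrix, and outer
// builds a rank-1 matrix. A CPU equivalent of matmul, for a row-major M x K
// matrix A and a K-vector B (illustrative, the name is a placeholder):
void matmul_cpu_reference(const ValueType *A, const ValueType *B, ValueType *R,
                          std::size_t M, std::size_t K) {
    for (std::size_t i = 0; i < M; ++i) {
        ValueType sum = 0;
        for (std::size_t k = 0; k < K; ++k) {
            sum += A[i * K + k] * B[k];
        }
        R[i] = sum;                       // same per-row result as matmulKernel
    }
}
// Likewise matmulT computes R[j] = sum_i W[i*N + j] * V[i] for an M x N matrix W,
// and outer computes result[i*n + j] = a[i] * b[j].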
-void *allocate(std::size_t size); +// ============================ +// Memory Management +// ============================ +void* allocate(std::size_t size); +void deallocate(void* devicePtr); + +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t count); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count); +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t count); + +void zero(ValueType* deviceData, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Vector) +// ============================ +void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Scalar) +// ============================ +void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); + +// ============================ +// Activation Functions +// ============================ + +// ReLU +void relu(const ValueType* input, ValueType* output, std::size_t count); +void relu_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Leaky ReLU +void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); +void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); + +// Sigmoid +void sigmoid(const ValueType* input, ValueType* output, std::size_t count); +void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Tanh +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count); +void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Softmax +void softmax(const ValueType* net, ValueType* out, std::size_t size); + +// ============================ +// Single Value Access +// ============================ +ValueType getValueAt(const ValueType* devicePtr, std::size_t index); +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value); + +// ============================ +// Matrix Operations +// ============================ +void matmul(const ValueType* A, const ValueType* B, ValueType* R, std::size_t M, std::size_t K); +void matmulT(const ValueType* W, const ValueType* V, ValueType* R, std::size_t M, std::size_t N); +void outer(const ValueType* a, const ValueType* b, ValueType* result, std::size_t m, std::size_t n); -/// Free GPU memory. -void deallocate(void *devicePtr); - -/// Copy data from CPU to GPU. -void copyToDevice(void *deviceDst, const void *hostSrc, std::size_t count); - -/// Copy data from GPU to CPU. -void copyToHost(void *hostDst, const void *deviceSrc, std::size_t count); - -/// Copy data from GPU to GPU. -void copyDeviceToDevice(void *deviceDst, const void *deviceSrc, std::size_t count); - -/// Set all elements to zero (on GPU). 
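// Note on the launch configuration used by the wrappers declared above: the
// block count is plain ceiling division. A small helper makes the arithmetic
// explicit (illustrative only, not part of this header):
constexpr std::size_t numBlocksFor(std::size_t count, std::size_t blockSize = 256) {
    // e.g. count = 1000, blockSize = 256  ->  (1000 + 255) / 256 = 4 blocks,
    // i.e. 1024 threads; the `if (idx < count)` guard in each kernel masks the
    // 24 surplus threads.
    return (count + blockSize - 1) / blockSize;
}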
-void zero(ValueType *deviceData, std::size_t count); - -/// Element-wise addition: C = A + B -void add_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A - B -void subtraction_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A / B -void division_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise multiply: C = A * B -void multiply_vec(const ValueType *A, const ValueType *B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A + B -void add_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A - B -void subtraction_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -/// Element-wise addition: C = A / B -void division_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -/// Element-wise multiply: C = A * B -void multiply_scalar(const ValueType *A, const ValueType B, ValueType *C, std::size_t count); - -// ---------------- ReLU ---------------- -void relu(const ValueType *input, ValueType *output, std::size_t count); -void relu_derivative(const ValueType *input, ValueType *output, std::size_t count); - -// ---------------- Sigmoid ---------------- -void sigmoid(const ValueType *input, ValueType *output, std::size_t count); -void sigmoid_derivative(const ValueType *input, ValueType *output, std::size_t count); - -// ---------------- Tanh ---------------- -void tanh_activation(const ValueType *input, ValueType *output, std::size_t count); -void tanh_derivative(const ValueType *input, ValueType *output, std::size_t count); - -// ---------------- Leaky ReLU ---------------- -void leaky_relu(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); -void leaky_relu_derivative(const ValueType *input, ValueType *output, std::size_t count, ValueType alpha = 0.01f); - -// ---------------- Softmax ---------------- -void softmax(const ValueType *net, ValueType *out, std::size_t size); - -ValueType getValueAt(const ValueType *devicePtr, std::size_t index); -void setValueAt(ValueType *devicePtr, std::size_t index, ValueType value); - -void matmul(const ValueType *A, const ValueType *B, ValueType *R, size_t M, size_t K); -void outer(const ValueType *a, const ValueType *b, ValueType *result, size_t m, size_t n); -void matmulT(const ValueType *W, const ValueType *V, ValueType *R, size_t M, size_t N); } // namespace nn::global::tensor_gpu #endif // TENSOR_GPU + From 586dc3f61222bb67ee35d3df2569d814bfed9ee3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maayan=20Portugues=20=F0=9F=8E=97=EF=B8=8F?= Date: Fri, 8 Aug 2025 23:54:08 +0300 Subject: [PATCH 33/40] Update cmake-multi-platform.yml --- .github/workflows/cmake-multi-platform.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 560d323..5923ace 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -28,6 +28,13 @@ jobs: libxi-dev \ libfreetype6-dev + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin + sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo 
dpkg -i cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo cp /var/cuda-repo-ubuntu2404-13-0-local/cuda-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get -y install cuda-toolkit-13-0 + - name: Checkout code uses: actions/checkout@v4 From 4ddf94dd952bedfd19e4f672dff0ddbdf6e42ddf Mon Sep 17 00:00:00 2001 From: maayan Date: Sat, 9 Aug 2025 13:02:35 +0300 Subject: [PATCH 34/40] improved code --- include/model.hpp | 30 ++++++----- src/model/model.cpp | 20 +++++--- src/model/tensor_gpu.cu | 81 ++++++++++++++++++++++++------ tests/binary_test.cpp | 9 +++- tests/data/config-binary_test.json | 2 +- 5 files changed, 108 insertions(+), 34 deletions(-) diff --git a/include/model.hpp b/include/model.hpp index b9c9ae0..9e099e8 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -4,6 +4,9 @@ #include "../src/model/dataBase.hpp" #include "../src/model/optimizers.hpp" #include "../src/visualizer/VisualizerController.hpp" +#include "Globals.hpp" +#include "tensor.hpp" +#include #include namespace nn::visualizer { @@ -65,7 +68,7 @@ class Model { global::ValueType runBackPropagation( const Batch &batch, const bool updateWeights, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void printTrainingResult( const std::chrono::high_resolution_clock::time_point &start, @@ -81,12 +84,12 @@ class Model { DataBase &dataBase, const bool cancleOnError = false, const bool showProgressbar = true, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void trainModel( DataBase &trainedDataBase, DataBase &evaluateDataBase, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); size_t outputSize() const; size_t inputSize() const; @@ -103,10 +106,13 @@ class Model { void autoSave(const int i); - void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); - void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); - std::uint32_t calculateSubNetWidth() const; + std::uint32_t calculateSubNetWidth() const; + + void runModel(const global::Tensor &input, + global::Transformation transformation); public: Model(const std::string &config_filepath); @@ -115,16 +121,16 @@ class Model { void runModel(const global::Tensor &input); void train( const std::string &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); void train( const std::vector &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); modelResult evaluateModel( const std::string &db_filename, const bool cancleOnError = false, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void save(const std::string &file); void load(const std::string &file); diff --git a/src/model/model.cpp b/src/model/model.cpp index a5a958f..d453f1c 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,5 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" +#include "Globals.hpp" 
#include "dataBase.hpp" #include "tensor.hpp" #include @@ -182,14 +183,12 @@ global::ValueType Model::runBackPropagation( global::Tensor output({outputSize()}); for (size_t i = 0; i < batch.size(); ++i) { TrainSample *current_sample_ptr = batch.samples.at(i); - output.zero(); - // visual.updatePrediction(current_sample_ptr->pre); + visual.updatePrediction(current_sample_ptr->pre); - // runModel(transformation(current_sample_ptr->input)); - - runModel(current_sample_ptr->input); + runModel(current_sample_ptr->input, transformation); if (doBackward) { + output.zero(); output.setValue({current_sample_ptr->pre.index}, 1); Backward(output); updateWeights(batch.size()); @@ -333,6 +332,15 @@ float Model::calculatePercentage(size_t currentSize, size_t totalSize) { return 100.0f * static_cast(currentSize) / static_cast(totalSize); } +void Model::runModel(const global::Tensor &input, + global::Transformation transformation) { + if (transformation) { + runModel(transformation(input)); + } else { + runModel(input); + } +} + modelResult Model::evaluateModel( DataBase &dataBase, const bool cancleOnError, @@ -352,7 +360,7 @@ modelResult Model::evaluateModel( for (int i = 0; i < result.dbSize; ++i) { TrainSample &sample = dataBase.getSample(i); - runModel(transformation(sample.input)); + runModel(sample.input, transformation); size_t predicted_index = 0; float max_value = getOutput().getValue({0}); diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu index 1a00003..5a2fc62 100644 --- a/src/model/tensor_gpu.cu +++ b/src/model/tensor_gpu.cu @@ -4,6 +4,13 @@ #include namespace nn::global::tensor_gpu { +#define CUDA_CHECK(call) do { \ + cudaError_t e = (call); \ + if (e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + throw std::runtime_error(cudaGetErrorString(e)); \ + } \ +} while(0) // ================================================== // Memory Management @@ -23,24 +30,24 @@ void deallocate(void* devicePtr) { } void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t size) { - cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice)); } void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t size) { - cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice); + CUDA_CHECK(cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice)); } void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { - cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost); + CUDA_CHECK(cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost)); } void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { - cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice); + CUDA_CHECK(cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice)); } ValueType getValueAt(const ValueType* devicePtr, std::size_t index) { ValueType value; - cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost); + CUDA_CHECK(cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost)); return value; } @@ -56,6 +63,8 @@ void zero(ValueType* deviceData, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; zeroKernel<<>>(deviceData, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -85,24 +94,32 
@@ void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t c std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; mulVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divVecKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -132,24 +149,32 @@ void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; addScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; subScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; mulScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; divScalarKernel<<>>(A, B, C, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -169,12 +194,16 @@ void relu(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; reluDerivativeKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } __global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) { @@ -198,12 +227,16 @@ void sigmoid(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t 
blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; sigmoidDerivativeKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } __global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) { @@ -223,12 +256,16 @@ void tanh_activation(const ValueType* input, ValueType* output, std::size_t coun std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; tanhDerivativeKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } __global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { @@ -245,12 +282,16 @@ void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, Va std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluKernel<<>>(input, output, count, alpha); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) { std::size_t blockSize = 256; std::size_t numBlocks = (count + blockSize - 1) / blockSize; leakyReluDerivativeKernel<<>>(input, output, count, alpha); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -258,31 +299,35 @@ void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_ // ================================================== __global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) { extern __shared__ ValueType shared[]; - std::size_t tid = threadIdx.x; - std::size_t idx = blockIdx.x * blockDim.x + tid; - if (idx >= count) return; + std::size_t blockStart = blockIdx.x * blockDim.x; + std::size_t idx = blockStart + tid; + // always write shared for every thread in block shared[tid] = (idx < count) ? input[idx] : -INFINITY; __syncthreads(); + // compute max (naive per-thread loop) ValueType max_val = shared[0]; - for (std::size_t i = 1; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { - max_val = fmaxf(max_val, shared[i]); + for (unsigned int i = 1; i < blockDim.x; ++i) { + std::size_t curIdx = blockStart + i; + if (curIdx < count) max_val = fmaxf(max_val, shared[i]); } __syncthreads(); - ValueType e = expf(shared[tid] - max_val); + ValueType e = (idx < count) ? expf(shared[tid] - max_val) : 0.0f; shared[tid] = e; __syncthreads(); + // compute sum (naive) ValueType sum = 0.0f; - for (std::size_t i = 0; i < blockDim.x && (blockIdx.x * blockDim.x + i) < count; ++i) { - sum += shared[i]; + for (unsigned int i = 0; i < blockDim.x; ++i) { + std::size_t curIdx = blockStart + i; + if (curIdx < count) sum += shared[i]; } __syncthreads(); - output[idx] = shared[tid] / sum; + if (idx < count) output[idx] = shared[tid] / (sum == 0.0f ? 
1.0f : sum); } void softmax(const ValueType* input, ValueType* output, std::size_t count) { @@ -290,6 +335,8 @@ void softmax(const ValueType* input, ValueType* output, std::size_t count) { std::size_t numBlocks = (count + blockSize - 1) / blockSize; std::size_t sharedMemSize = blockSize * sizeof(ValueType); softmaxKernel<<>>(input, output, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } // ================================================== @@ -357,6 +404,8 @@ void matmul(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size int gridSize = (M + blockSize - 1) / blockSize; matmulKernel<<>>(A, B, R, M, K); cudaDeviceSynchronize(); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) { @@ -364,6 +413,8 @@ void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, int gridSize = (m * n + blockSize - 1) / blockSize; outerKernel<<>>(a, b, result, m, n); cudaDeviceSynchronize(); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) { @@ -371,6 +422,8 @@ void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, siz int gridSize = (N + blockSize - 1) / blockSize; matmulTKernel<<>>(W, V, R, M, N); cudaDeviceSynchronize(); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); } } // namespace nn::global::tensor_gpu diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 17b318a..34ac415 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -79,7 +79,15 @@ int main(int argc, char *argv[]) { std::string config_FN = tests::appendToBase("config-binary_test.json"); // nn::global::Tensor give_me_a_name({5, 3}); // printf("test: \n"); + // give_me_a_name.fill(5); // give_me_a_name.setValue({2, 1}, 5); + // + // nn::global::Tensor give_me_a_name1({5, 3}); + // printf("test: \n"); + // give_me_a_name1.fill(3); + // give_me_a_name1.setValue({2, 1}, 10); + // give_me_a_name1 += give_me_a_name; + // printf("test: %f\n", give_me_a_name1.getValue({2,1})); // return 0; nn::model::Model model(config_FN); @@ -87,7 +95,6 @@ int main(int argc, char *argv[]) { if (argc > 1 && std::string(argv[1]) == "l") { model.load("test.txt"); } else { - model.load("test.txt"); nn::model::modelResult result = model.evaluateModel("../tests/data/database-binary_test"); std::cout << "training result: " << result.percentage << "%\n"; std::vector files{"../tests/data/test1", "../tests/data/test2"}; diff --git a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index fb3283a..e95589c 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -8,7 +8,7 @@ ] }, "training config": { - "batch size": 16, + "batch size": 64, "batch count": 1000, "optimizer": { "type": "const", From 369b2ab93a846cc0154ef0b5026636bb9a71a31c Mon Sep 17 00:00:00 2001 From: maayan Date: Sat, 9 Aug 2025 16:37:24 +0300 Subject: [PATCH 35/40] bug fixes --- include/model.hpp | 4 ++-- src/model/activations.cpp | 17 +++++++------- src/model/model.cpp | 36 +++++++++++++++++++++++------- src/model/tensor.cpp | 11 ++++----- src/networks/fnn/DenseLayer.cpp | 8 +++---- src/networks/fnn/FnnVisualizer.hpp | 8 +++---- tests/binary_test.cpp | 12 ---------- tests/data/config-binary_test.json | 7 ++++-- 8 files changed, 58 insertions(+), 45 deletions(-) diff --git a/include/model.hpp 
b/include/model.hpp index 9e099e8..26aed62 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -132,8 +132,8 @@ class Model { const bool cancleOnError = false, global::Transformation transformation = nullptr); - void save(const std::string &file); - void load(const std::string &file); + void save(const std::string &file, bool print = true); + void load(const std::string &file, bool print = true); global::Prediction getPrediction() const; }; diff --git a/src/model/activations.cpp b/src/model/activations.cpp index f7a70b1..0a22ac3 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -46,16 +46,17 @@ void Activation::derivativeActivate(const global::Tensor &net, global::Tensor &o } global::ValueType Activation::maxVector(const global::Tensor &metrix) { - if (metrix.isGpu) { - } - global::ValueType max = metrix.cpu_data[0]; - for (size_t i = 0; i < metrix.numElements(); ++i) { - if (metrix.getValue({i}) > max) { - max = metrix.getValue({i}); + if (!metrix.isGpu) { + global::ValueType max = metrix.cpu_data[0]; + for (size_t i = 0; i < metrix.numElements(); ++i) { + if (metrix.getValue({i}) > max) { + max = metrix.getValue({i}); + } } - } - return max; + return max; + } + return 0; } global::ValueType Activation::relu(const global::ValueType z) { diff --git a/src/model/model.cpp b/src/model/model.cpp index d453f1c..3fa4f24 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,8 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" -#include "Globals.hpp" #include "dataBase.hpp" -#include "tensor.hpp" #include #include #include @@ -92,6 +90,7 @@ std::uint32_t Model::calculateSubNetWidth() const { void Model::initModel() { const std::uint32_t WIDTH = calculateSubNetWidth(); + size_t param_amount = 0; for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { ISubNetworkConfig &_config = *config.networkConfig.SubNetworksConfig[i]; @@ -101,7 +100,14 @@ void Model::initModel() { } else if (_config.NNLable() == cnn::CNN_LABLE) { addCNN(WIDTH, _config); } + + param_amount += network[i]->getParams().numElements(); } + + std::cout << "initialize model - " + << param_amount << " parameters, " + << config.networkConfig.SubNetworksConfig.size() << " sub networks" + << std::endl; } void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { @@ -284,7 +290,7 @@ bool Model::autoEvaluating( void Model::autoSave(const int i) { if (config.trainingConfig.isAutoSave() && i % config.trainingConfig.getAutoSave().saveEvery == 0) { - save(config.trainingConfig.getAutoSave().dataFilenameAutoSave); + save(config.trainingConfig.getAutoSave().dataFilenameAutoSave, false); } } @@ -415,9 +421,13 @@ size_t Model::inputSize() const { return network[0]->inputSize(); } -void Model::save(const std::string &file) { +void Model::save(const std::string &file, bool print) { std::ofstream outFile(file); + if (print) { + std::cout << "Start saving" << std::endl; + } + for (size_t i = 0; i < network.size(); ++i) { global::Tensor params = network[i]->getParams(); @@ -425,18 +435,26 @@ void Model::save(const std::string &file) { for (size_t j = 0; j < params.numElements(); ++j) { outFile << params.getValue({j}) << " "; } - outFile << std::endl; } + if (print) { + std::cout << " saving complete" << std::endl; + } + outFile.close(); } -void Model::load(const std::string &file) { +void Model::load(const std::string &file, bool print) { std::ifstream inFile(file); std::string line; int networkI = 0; + + if (print) { + std::cout << 
"Start loading" << std::endl; + } + while (std::getline(inFile, line)) { std::istringstream iss(line); @@ -445,17 +463,19 @@ void Model::load(const std::string &file) { global::Tensor numbers({ParamSize}); float num; - for (size_t i = 0; i < ParamSize; ++i) { iss >> num; numbers.setValue({i}, num); } network[networkI]->setParams(numbers); - networkI++; } + if (print) { + std::cout << " loading complete" << std::endl; + } + inFile.close(); } diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index ba95d29..88c3f4e 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -33,6 +33,7 @@ Tensor::Tensor(const Tensor &other) { if (isGpu) { gpu_data_size = other.gpu_data_size; gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); } else { cpu_data = other.cpu_data; @@ -80,18 +81,18 @@ Tensor &Tensor::operator=(const Tensor &other) { if (!isGpu) { cpu_data = other.cpu_data; } else { - ValueType *temp = gpu_data; if (gpu_data_size != other.gpu_data_size) { + ValueType *temp = gpu_data; temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; - } - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(temp, other.gpu_data, gpu_data_size * sizeof(ValueType)); - if (gpu_data_size != other.gpu_data_size) { tensor_gpu::deallocate(gpu_data); gpu_data = temp; - } + } else { + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + } } shape = other.shape; diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp index d955dc3..b467dc2 100644 --- a/src/networks/fnn/DenseLayer.cpp +++ b/src/networks/fnn/DenseLayer.cpp @@ -168,12 +168,12 @@ void DenseLayer::fillParamRandom() { } void DenseLayer::resetDots() { - net.fill(0); - out.fill(0); + net.zero(); + out.zero(); } void DenseLayer::resetGradient() { - gradients.biases.fill(0); - gradients.weights.fill(0); + gradients.biases.zero(); + gradients.weights.zero(); } } // namespace nn::model::fnn diff --git a/src/networks/fnn/FnnVisualizer.hpp b/src/networks/fnn/FnnVisualizer.hpp index 067050c..a497484 100644 --- a/src/networks/fnn/FnnVisualizer.hpp +++ b/src/networks/fnn/FnnVisualizer.hpp @@ -32,11 +32,11 @@ static const std::array color_lookup = { class VisualDenseLayer { private: - global::Tensor net{{0}}; - global::Tensor out{{0}}; + global::Tensor net{{1}}; + global::Tensor out{{1}}; - model::fnn::LayerParams parameters{0, 0}; - model::fnn::LayerParams gradients{0, 0}; + model::fnn::LayerParams parameters{1, 1}; + model::fnn::LayerParams gradients{1, 1}; sf::Vector2f pos; diff --git a/tests/binary_test.cpp b/tests/binary_test.cpp index 34ac415..4fdba8f 100644 --- a/tests/binary_test.cpp +++ b/tests/binary_test.cpp @@ -77,18 +77,6 @@ int main(int argc, char *argv[]) { size_t input_size = 10; std::string config_FN = tests::appendToBase("config-binary_test.json"); - // nn::global::Tensor give_me_a_name({5, 3}); - // printf("test: \n"); - // give_me_a_name.fill(5); - // give_me_a_name.setValue({2, 1}, 5); - // - // nn::global::Tensor give_me_a_name1({5, 3}); - // printf("test: \n"); - // give_me_a_name1.fill(3); - // give_me_a_name1.setValue({2, 1}, 10); - // give_me_a_name1 += give_me_a_name; - // printf("test: %f\n", give_me_a_name1.getValue({2,1})); - // return 0; nn::model::Model model(config_FN); diff --git 
a/tests/data/config-binary_test.json b/tests/data/config-binary_test.json index e95589c..aef3f34 100644 --- a/tests/data/config-binary_test.json +++ b/tests/data/config-binary_test.json @@ -1,6 +1,6 @@ { "visual config": { - "enableVisuals": false, + "enableVisuals": true, "modes": [ { "state": "pause", "mode": true }, { "state": "precise mode", "mode": false }, @@ -22,7 +22,10 @@ "output size": 16, "output activation": 4, "layers": [ - { "size": 1000, "activationType": 1 }, + { "size": 10, "activationType": 1 }, + { "size": 10, "activationType": 1 }, + { "size": 10, "activationType": 1 }, + { "size": 10, "activationType": 1 }, { "size": 300, "activationType": 1 } ] } From c271d62ea41020c5dfa7429678d2cf0e1e6b4a70 Mon Sep 17 00:00:00 2001 From: maayan Date: Sat, 9 Aug 2025 18:25:59 +0300 Subject: [PATCH 36/40] now it is possible to show only status on visual --- include/tensor.hpp | 7 +- src/model/config.hpp | 3 +- src/model/model.cpp | 7 +- src/model/tensor.cpp | 126 ++++++++++++-------------- src/visualizer/VInterface.cpp | 35 +++++-- src/visualizer/VInterface.hpp | 6 +- src/visualizer/VisualizerRenderer.cpp | 43 ++++++--- src/visualizer/VisualizerRenderer.hpp | 5 +- tests/data/config-binary_test.json | 9 +- 9 files changed, 137 insertions(+), 104 deletions(-) diff --git a/include/tensor.hpp b/include/tensor.hpp index 1064f1d..bb0749d 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -30,7 +30,7 @@ class Tensor { public: // Constructors - Tensor(const std::vector &shape, float init = 0.0f); + Tensor(const std::vector &shape, ValueType init = 0.0f); Tensor(const Tensor &other); ~Tensor(); @@ -40,9 +40,8 @@ class Tensor { ValueType getValue(const std::vector &newShape) const; void setValue(const std::vector &newShape, const ValueType value); - void insertRange(const Tensor &other, - const size_t startO, const size_t startT, - const size_t length); + void insertRange(const Tensor &other, const size_t startO, + const size_t startT, const size_t length); // Shape and size size_t numElements() const; diff --git a/src/model/config.hpp b/src/model/config.hpp index 2c4b4ef..5aa54b7 100644 --- a/src/model/config.hpp +++ b/src/model/config.hpp @@ -148,9 +148,10 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualMode, state, mode); struct VisualConfig { bool enableVisuals{true}; + bool enableNetwrokVisual{true}; std::vector modes; }; -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, modes); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, enableNetwrokVisual, modes); class Config { public: diff --git a/src/model/model.cpp b/src/model/model.cpp index 3fa4f24..694dd8c 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -78,6 +78,9 @@ void Model::initOptimizer() { void Model::initVisual() { visual.start(); + if (!config.visualConfig.enableNetwrokVisual) + return; + for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { visual.addVisualSubNetwork(network[i]->getVisual()); network[i]->getVisual()->setVstate(visual.Vstate); @@ -113,7 +116,7 @@ void Model::initModel() { void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { fnn::FNNConfig &sub_ = (fnn::FNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, @@ -129,7 +132,7 @@ void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { void Model::addCNN(const std::uint32_t width, ISubNetworkConfig 
&_config) { cnn::CNNConfig &sub_ = (cnn::CNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 88c3f4e..8e40e5b 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -4,7 +4,7 @@ #include namespace nn::global { -Tensor::Tensor(const std::vector &shape_, float init) { +Tensor::Tensor(const std::vector &shape_, ValueType init) { if (shape_.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } @@ -16,12 +16,12 @@ Tensor::Tensor(const std::vector &shape_, float init) { std::multiplies<>()); shape = shape_; - if (!isGpu) { - cpu_data.assign(totalSize, init); - } else { + if (isGpu) { gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); gpu_data_size = totalSize; fill(init); + } else { + cpu_data.assign(totalSize, init); } computeStrides(); @@ -33,7 +33,6 @@ Tensor::Tensor(const Tensor &other) { if (isGpu) { gpu_data_size = other.gpu_data_size; gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); - tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); } else { cpu_data = other.cpu_data; @@ -48,10 +47,10 @@ size_t Tensor::numElements() const { } void Tensor::getData(std::vector &dest) const { - if (!isGpu) { - dest = cpu_data; - } else { + if (isGpu) { tensor_gpu::copyToHost(dest.data(), gpu_data, gpu_data_size * sizeof(ValueType)); + } else { + dest = cpu_data; } } @@ -78,21 +77,18 @@ Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - if (!isGpu) { - cpu_data = other.cpu_data; - } else { + if (isGpu) { if (gpu_data_size != other.gpu_data_size) { - ValueType *temp = gpu_data; - temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); - + ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); gpu_data_size = other.gpu_data_size; tensor_gpu::copyDeviceToDevice(temp, other.gpu_data, gpu_data_size * sizeof(ValueType)); - tensor_gpu::deallocate(gpu_data); gpu_data = temp; } else { tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); - } + } + } else { + cpu_data = other.cpu_data; } shape = other.shape; @@ -102,13 +98,13 @@ Tensor &Tensor::operator=(const Tensor &other) { Tensor &Tensor::operator=(const std::vector &other) { if (other.size() != numElements()) { - throw std::invalid_argument(""); + throw std::length_error("Tensor assignment size mismatch"); } - if (!isGpu) { - cpu_data = other; - } else { + if (isGpu) { tensor_gpu::copyToDevice(gpu_data, other.data(), gpu_data_size * sizeof(ValueType)); + } else { + cpu_data = other; } return *this; @@ -138,11 +134,11 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { } ValueType Tensor::getValue(const std::vector &indices) const { - if (!isGpu) { - return cpu_data[flattenIndex(indices)]; + if (isGpu) { + return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } - return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); + return cpu_data[flattenIndex(indices)]; } void Tensor::insertRange(const Tensor &other, @@ -158,22 +154,21 @@ void Tensor::insertRange(const Tensor &other, } void Tensor::setValue(const std::vector &indices, const ValueType value) { - if (!isGpu) { - cpu_data[flattenIndex(indices)] = value; - } else { + if (isGpu) { 
@@ -138,11 +134,11 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const {
 }
 
 ValueType Tensor::getValue(const std::vector &indices) const {
-  if (!isGpu) {
-    return cpu_data[flattenIndex(indices)];
+  if (isGpu) {
+    return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices));
   }
 
-  return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices));
+  return cpu_data[flattenIndex(indices)];
 }
 
 void Tensor::insertRange(const Tensor &other,
@@ -158,22 +154,21 @@
 }
 
 void Tensor::setValue(const std::vector &indices, const ValueType value) {
-  if (!isGpu) {
-    cpu_data[flattenIndex(indices)] = value;
-  } else {
+  if (isGpu) {
     tensor_gpu::setValueAt(gpu_data, flattenIndex(indices), value);
+  } else {
+    cpu_data[flattenIndex(indices)] = value;
   }
 }
 
 Tensor &Tensor::operator+=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator+=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] += other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::add_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] += other.cpu_data[i];
   }
   return *this;
 }
@@ -181,12 +176,11 @@ Tensor &Tensor::operator+=(const Tensor &other) {
 Tensor &Tensor::operator-=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator-=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] -= other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::subtraction_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] -= other.cpu_data[i];
   }
   return *this;
 }
@@ -194,12 +188,11 @@ Tensor &Tensor::operator-=(const Tensor &other) {
 Tensor &Tensor::operator*=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator*=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] *= other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::multiply_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] *= other.cpu_data[i];
   }
   return *this;
 }
@@ -207,52 +200,51 @@ Tensor &Tensor::operator*=(const Tensor &other) {
 Tensor &Tensor::operator/=(const Tensor &other) {
   if (shape != other.shape)
     throw std::invalid_argument("Shape mismatch in Tensor::operator/=.");
-  if (!isGpu) {
-    const size_t N = cpu_data.size();
-    for (size_t i = 0; i < N; ++i)
-      cpu_data[i] /= other.cpu_data[i];
-  } else {
+  if (isGpu) {
     tensor_gpu::division_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size);
+  } else {
+    for (size_t i = 0; i < cpu_data.size(); ++i)
+      cpu_data[i] /= other.cpu_data[i];
   }
   return *this;
 }
 
 Tensor &Tensor::operator*=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x *= scalar;
-  } else {
-    tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
 
 Tensor &Tensor::operator-=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x -= scalar;
-  } else {
-    tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
 
 Tensor &Tensor::operator+=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x += scalar;
-  } else {
-    tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
 
 Tensor &Tensor::operator/=(ValueType scalar) {
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
+  } else {
     for (auto &x : cpu_data)
       x /= scalar;
-  } else {
-    tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size);
   }
   return *this;
 }
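
The element-wise hunks above route the GPU path through tensor_gpu::add_vec, subtraction_vec,
multiply_vec, division_vec and the *_scalar variants, each taking (lhs, rhs, destination, element
count) with the destination allowed to alias an input. Those kernels are not shown in this patch;
as a rough sketch of what one of them might look like (launch geometry, synchronization and error
handling are illustrative choices, not the project's actual code):

    // Assumed sketch of an element-wise kernel behind tensor_gpu::add_vec; not the
    // repository's implementation. Reading a[i] and b[i] before writing out[i] keeps
    // the in-place call pattern add_vec(gpu_data, other.gpu_data, gpu_data, n) safe.
    #include <cuda_runtime.h>
    #include <cstddef>

    namespace tensor_gpu {

    __global__ void add_vec_kernel(const float *a, const float *b, float *out,
                                   std::size_t n) {
      std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n)
        out[i] = a[i] + b[i];
    }

    void add_vec(const float *a, const float *b, float *out, std::size_t n) {
      constexpr unsigned threadsPerBlock = 256; // illustrative default
      const unsigned blocks =
          static_cast<unsigned>((n + threadsPerBlock - 1) / threadsPerBlock);
      add_vec_kernel<<<blocks, threadsPerBlock>>>(a, b, out, n);
      cudaDeviceSynchronize(); // simplest blocking choice for a sketch
    }

    } // namespace tensor_gpu
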
@@ -270,7 +262,9 @@ void Tensor::matmul(const Tensor &other, Tensor &result) const {
 
   result.zero();
 
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K);
+  } else {
     const float *A = cpu_data.data();
     const float *B = other.cpu_data.data();
     float *R = result.cpu_data.data();
@@ -283,8 +277,6 @@ void Tensor::matmul(const Tensor &other, Tensor &result) const {
       }
       R[i] = sum;
     }
-  } else {
-    tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K);
   }
 }
 
@@ -298,7 +290,9 @@ void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) {
 
   result.zero();
 
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n);
+  } else {
     float *r = result.cpu_data.data();
     const float *A = a.cpu_data.data();
     const float *B = b.cpu_data.data();
@@ -308,8 +302,6 @@ void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) {
         r[i * n + j] += A[i] * B[j];
       }
     }
-  } else {
-    tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n);
   }
 }
 
@@ -321,14 +313,14 @@ void Tensor::matmulT(const Tensor &vec, Tensor &result) const {
 
   result.zero();
 
-  if (!isGpu) {
+  if (isGpu) {
+    tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]);
+  } else {
     for (size_t i = 0; i < shape[1]; ++i) {
       for (size_t j = 0; j < shape[0]; ++j) {
        result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j];
      }
    }
-  } else {
-    tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]);
   }
 }
diff --git a/src/visualizer/VInterface.cpp b/src/visualizer/VInterface.cpp
index 5fa4ebd..8a81499 100644
--- a/src/visualizer/VInterface.cpp
+++ b/src/visualizer/VInterface.cpp
@@ -1,30 +1,45 @@
 #include "VInterface.hpp"
+#include "state.hpp"
 
 namespace nn::visualizer {
 
-IntefacePanel::IntefacePanel(const std::shared_ptr vstate)
+InterfacePanel::InterfacePanel(const std::shared_ptr vstate)
     : Panel(vstate), VRender({VINTERFACE_WIDTH, VINTERFACE_HEIGHT}) {
   createVInterface();
 }
 
-void IntefacePanel::createVInterface() {
+void InterfacePanel::createVInterface() {
   VRender.clear(INTERFACE_PANEL_COLOR);
   buttons.reserve(STATES_COUNT);
 
+  constexpr std::array skipWhenDisabled = {
+      SettingType::AutoPause,
+      SettingType::Pause,
+      SettingType::PreciseMode};
+
   for (int i = 0; i < STATES_COUNT; ++i) {
-    buttons.push_back(std::make_unique