diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 4e76790..5923ace 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -28,6 +28,13 @@ jobs: libxi-dev \ libfreetype6-dev + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin + sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo dpkg -i cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo cp /var/cuda-repo-ubuntu2404-13-0-local/cuda-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get -y install cuda-toolkit-13-0 + - name: Checkout code uses: actions/checkout@v4 @@ -36,3 +43,4 @@ jobs: - name: Build run: cmake --build build --config Release + diff --git a/CMakeLists.txt b/CMakeLists.txt index 86f4cc4..c29b4f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,18 @@ cmake_minimum_required(VERSION 3.28) -project(NeuralNetwork LANGUAGES CXX) +set(CMAKE_CUDA_ARCHITECTURES 86) # For RTX 3060 +project(NeuralNetwork LANGUAGES CXX CUDA) # Add CUDA here # ------------------------------------------------------------------ # Configuration set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) # Add CUDA standard +set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Enforce it set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +enable_language(CUDA) + # Default to Debug build type if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) @@ -39,22 +44,23 @@ FetchContent_Declare(nlohmann_json FetchContent_MakeAvailable(SFML nlohmann_json) -# ------------------------------------------------------------------ -# Function: Apply sanitizers -function(apply_sanitizers target) - target_compile_options(${target} PRIVATE -fsanitize=address -fno-omit-frame-pointer -g) - target_link_libraries(${target} PRIVATE -fsanitize=address) -endfunction() - # ------------------------------------------------------------------ # Main library + +# Add both C++ and CUDA sources file(GLOB_RECURSE NN_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu" # Include CUDA source files ) add_library(NeuralNetwork STATIC ${NN_SOURCES}) set_target_properties(NeuralNetwork PROPERTIES POSITION_INDEPENDENT_CODE ON) +# Enable separable compilation for CUDA files +set_target_properties(NeuralNetwork PROPERTIES + CUDA_SEPARABLE_COMPILATION ON +) + target_include_directories(NeuralNetwork PUBLIC $ @@ -71,21 +77,19 @@ target_link_libraries(NeuralNetwork SFML::Window SFML::System nlohmann_json::nlohmann_json + cuda ) target_compile_options(NeuralNetwork PRIVATE -Wall -Wextra -Wpedantic) # ------------------------------------------------------------------ -# Tests (with sanitizers) +# Tests option(BUILD_NN_TESTS "Build NeuralNetwork tests" OFF) if(BUILD_NN_TESTS) enable_testing() include(CTest) - # Apply sanitizers only for test builds - apply_sanitizers(NeuralNetwork) - file(GLOB TEST_SOURCES CONFIGURE_DEPENDS tests/*.cpp) if(TEST_SOURCES) @@ -96,8 +100,6 @@ if(BUILD_NN_TESTS) target_link_libraries(${test_name} PRIVATE NeuralNetwork) target_include_directories(${test_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") - apply_sanitizers(${test_name}) - add_test(NAME ${test_name} COMMAND 
${test_name}) endforeach() else() @@ -109,3 +111,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) + diff --git a/include/model.hpp b/include/model.hpp index b9c9ae0..26aed62 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -4,6 +4,9 @@ #include "../src/model/dataBase.hpp" #include "../src/model/optimizers.hpp" #include "../src/visualizer/VisualizerController.hpp" +#include "Globals.hpp" +#include "tensor.hpp" +#include #include namespace nn::visualizer { @@ -65,7 +68,7 @@ class Model { global::ValueType runBackPropagation( const Batch &batch, const bool updateWeights, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void printTrainingResult( const std::chrono::high_resolution_clock::time_point &start, @@ -81,12 +84,12 @@ class Model { DataBase &dataBase, const bool cancleOnError = false, const bool showProgressbar = true, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void trainModel( DataBase &trainedDataBase, DataBase &evaluateDataBase, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); size_t outputSize() const; size_t inputSize() const; @@ -103,10 +106,13 @@ class Model { void autoSave(const int i); - void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); - void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); - std::uint32_t calculateSubNetWidth() const; + std::uint32_t calculateSubNetWidth() const; + + void runModel(const global::Tensor &input, + global::Transformation transformation); public: Model(const std::string &config_filepath); @@ -115,19 +121,19 @@ class Model { void runModel(const global::Tensor &input); void train( const std::string &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); void train( const std::vector &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); modelResult evaluateModel( const std::string &db_filename, const bool cancleOnError = false, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); - void save(const std::string &file); - void load(const std::string &file); + void save(const std::string &file, bool print = true); + void load(const std::string &file, bool print = true); global::Prediction getPrediction() const; }; diff --git a/include/network/INetwork.hpp b/include/network/INetwork.hpp index 530a444..574c600 100644 --- a/include/network/INetwork.hpp +++ b/include/network/INetwork.hpp @@ -11,7 +11,7 @@ class INetwork { virtual ~INetwork() = default; virtual void forward(const global::Tensor &input) = 0; - virtual void backward(const global::Tensor &outputDeltas) = 0; + virtual void backward(global::Tensor **outputDeltas) = 0; virtual void updateWeights(IOptimizer &optimizer) = 0; virtual void resetGradient() = 0; diff --git a/include/network/IvisualNetwork.hpp 
b/include/network/IvisualNetwork.hpp index ae08c3a..58b49af 100644 --- a/include/network/IvisualNetwork.hpp +++ b/include/network/IvisualNetwork.hpp @@ -3,8 +3,7 @@ #include "../../src/visualizer/panel.hpp" #include -#include -#include +#include namespace nn::visualizer { constexpr std::uint32_t MODEL_HEIGHT = 770u; diff --git a/include/tensor.hpp b/include/tensor.hpp index 71b24d6..3f32e39 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -1,59 +1,55 @@ #ifndef TENSOR #define TENSOR -#include +#include "../src/model/tensor_gpu.hpp" #include +namespace nn::model { +class Activation; +void enableGpuMode(); +} // namespace nn::model + namespace nn::global { -using ValueType = float; class Tensor { private: - std::vector data; + std::vector cpu_data; std::vector shape; std::vector strides; + ValueType *gpu_data = nullptr; + std::size_t gpu_data_size; + + static bool isGpu; + static size_t tensorCount; + void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; + friend model::Activation; + public: // Constructors - Tensor(const std::vector &shape, float init = 0.0f); - Tensor(const Tensor &other) - : data(other.data), - shape(other.shape), - strides(other.strides) {} + Tensor(const std::vector &shape, ValueType init = 0.0f); + Tensor(const Tensor &other); - Tensor &operator=(const Tensor &other); + ~Tensor(); - // Element access - ValueType &operator()(const std::vector &indices); - ValueType operator()(const std::vector &indices) const; - inline ValueType &operator[](size_t i) { return data[i]; } - inline const ValueType &operator[](size_t i) const { return data[i]; } + Tensor &operator=(const Tensor &other); + Tensor &operator=(const std::vector &other); - // Iterators (for range-based loops) - auto begin() noexcept { return data.begin(); } - auto end() noexcept { return data.end(); } - auto begin() const noexcept { return data.begin(); } - auto end() const noexcept { return data.end(); } + ValueType getValue(const std::vector &newShape) const; + void setValue(const std::vector &newShape, const ValueType value); + void insertRange(const Tensor &other, const size_t startO, + const size_t startT, const size_t length); // Shape and size - inline const std::vector &getShape() const { return shape; } - inline size_t numElements() const { return data.size(); } - inline const std::vector &getData() const { return data; } - inline void fill(const ValueType &value) { std::fill(begin(), end(), value); } - - // Arithmetic operations - Tensor operator+(const Tensor &other) const; - Tensor operator*(const Tensor &other) const; - Tensor operator-(const Tensor &other) const; - Tensor operator/(const Tensor &other) const; - - Tensor operator*(ValueType scalar) const; - Tensor operator+(ValueType scalar) const; - Tensor operator/(ValueType scalar) const; - Tensor operator-(ValueType scalar) const; + size_t numElements() const; + const std::vector &getShape() const { return shape; } + const std::vector &getStrides() const { return strides; } + void getData(std::vector &dest) const; + void fill(const ValueType &value); + void zero(); Tensor &operator+=(const Tensor &other); Tensor &operator-=(const Tensor &other); @@ -65,10 +61,14 @@ class Tensor { Tensor &operator+=(ValueType scalar); Tensor &operator-=(ValueType scalar); - Tensor matmul(const Tensor &other) const; - static Tensor outer(const Tensor &a, const Tensor &b); - Tensor matmulT(const Tensor &vec) const; + void matmul(const Tensor &other, Tensor &result) const; + static void outer(const Tensor &a, const Tensor 
&b, Tensor &result); + void matmulT(const Tensor &vec, Tensor &result) const; + + static void toGpu(); + static void toCpu(); }; + } // namespace nn::global #endif // TENSOR diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 19669bf..0a22ac3 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,36 +1,9 @@ #include "activations.hpp" +#include "tensor.hpp" +#include "tensor_gpu.hpp" +#include namespace nn::model { -global::ValueType Activation::activate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return relu(z); - case ActivationType::LeakyRelu: - return leakyRelu(z); - case ActivationType::Sigmoid: - return sigmoid(z); - case ActivationType::Tanh: - return tanh(z); - default: - return z; - } -} - -global::ValueType Activation::derivativeActivate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return derivativeRelu(z); - case ActivationType::LeakyRelu: - return derivativeLeakyRelu(z); - case ActivationType::Sigmoid: - return derivativeSigmoid(z); - case ActivationType::Tanh: - return derivativeTanh(z); - default: - return z; - } -} - void Activation::activate(const global::Tensor &net, global::Tensor &out) const { switch (activationType) { case ActivationType::Relu: @@ -73,14 +46,17 @@ void Activation::derivativeActivate(const global::Tensor &net, global::Tensor &o } global::ValueType Activation::maxVector(const global::Tensor &metrix) { - global::ValueType max = metrix[0]; - for (auto &value : metrix) { - if (value > max) { - max = value; + if (!metrix.isGpu) { + global::ValueType max = metrix.cpu_data[0]; + for (size_t i = 0; i < metrix.numElements(); ++i) { + if (metrix.getValue({i}) > max) { + max = metrix.getValue({i}); + } } - } - return max; + return max; + } + return 0; } global::ValueType Activation::relu(const global::ValueType z) { @@ -114,61 +90,105 @@ global::ValueType Activation::derivativeTanh(const global::ValueType z) { } void Activation::relu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = relu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = relu(net.cpu_data[i]); + } + } } void Activation::derivativeRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeRelu(net.cpu_data[i]); + } + } } void Activation::leakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = leakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = leakyRelu(net.cpu_data[i]); + } + } } void Activation::derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeLeakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeLeakyRelu(net.cpu_data[i]); + } + 
} } void Activation::sigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = sigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = sigmoid(net.cpu_data[i]); + } + } } void Activation::derivativeSigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeSigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeSigmoid(net.cpu_data[i]); + } + } } void Activation::tanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = tanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_activation(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = tanh(net.cpu_data[i]); + } + } } void Activation::derivativeTanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeTanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeTanh(net.cpu_data[i]); + } + } } void Activation::softmax(const global::Tensor &net, global::Tensor &out) { - global::ValueType max = maxVector(net); - global::ValueType sum = 0.0; - - for (size_t i = 0; i < net.numElements(); ++i) { - global::ValueType x = net[i] - max; - if (x < -700.0) - x = -700.0; - if (x > 700.0) - x = 700.0; - out[i] = std::exp(x); - sum += out[i]; - } + if (net.isGpu) { + global::tensor_gpu::softmax(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + global::ValueType max = maxVector(net); + global::ValueType sum = 0.0; + + for (size_t i = 0; i < net.numElements(); ++i) { + global::ValueType x = net.cpu_data[i] - max; + if (x < -700.0) + x = -700.0; + if (x > 700.0) + x = 700.0; + out.cpu_data[i] = std::exp(x); + sum += out.cpu_data[i]; + } - sum = maxValue(sum, 1e-10); + sum = maxValue(sum, 1e-10); - out /= sum; + out /= sum; + } } } // namespace nn::model diff --git a/src/model/activations.hpp b/src/model/activations.hpp index e49a8f0..010de2a 100644 --- a/src/model/activations.hpp +++ b/src/model/activations.hpp @@ -2,10 +2,10 @@ #define ACTIVATIONSP #include "tensor.hpp" -#include #include namespace nn::model { + constexpr global::ValueType RELU_LEAKY_ALPHA = 0.01; constexpr global::ValueType maxValue(const global::ValueType &a, const float &b) { @@ -38,16 +38,13 @@ class Activation { static global::ValueType derivativeTanh(const global::ValueType z); static void relu(const global::Tensor &net, global::Tensor &out); - static void derivativeRelu(const global::Tensor &net, - global::Tensor &out); + static void derivativeRelu(const global::Tensor &net, global::Tensor &out); - static void leakyRelu(const global::Tensor &net, - global::Tensor &out); + static void leakyRelu(const global::Tensor &net, global::Tensor &out); static void derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out); - static void sigmoid(const global::Tensor &net, - global::Tensor &out); + static void sigmoid(const global::Tensor &net, global::Tensor &out); static void 
derivativeSigmoid(const global::Tensor &net, global::Tensor &out); @@ -55,8 +52,7 @@ class Activation { static void derivativeTanh(const global::Tensor &net, global::Tensor &out); - static void softmax(const global::Tensor &net, - global::Tensor &out); + static void softmax(const global::Tensor &net, global::Tensor &out); static global::ValueType maxVector(const global::Tensor &metrix); @@ -67,9 +63,6 @@ class Activation { : activationType(other.activationType) {} ~Activation() = default; - global::ValueType activate(const global::ValueType x) const; - global::ValueType derivativeActivate(const global::ValueType x) const; - void activate(const global::Tensor &net, global::Tensor &out) const; void derivativeActivate(const global::Tensor &net, diff --git a/src/model/config.hpp b/src/model/config.hpp index 2c4b4ef..5aa54b7 100644 --- a/src/model/config.hpp +++ b/src/model/config.hpp @@ -148,9 +148,10 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualMode, state, mode); struct VisualConfig { bool enableVisuals{true}; + bool enableNetwrokVisual{true}; std::vector modes; }; -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, modes); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, enableNetwrokVisual, modes); class Config { public: diff --git a/src/model/dataBase.cpp b/src/model/dataBase.cpp index 2c13444..2a8afce 100644 --- a/src/model/dataBase.cpp +++ b/src/model/dataBase.cpp @@ -26,7 +26,7 @@ TrainSample DataBase::readLine(const std::string &line) { for (size_t i = 0; i < samples.sInputSize; ++i) { iss >> token; - new_sample.input({i}) = std::stod(token); + new_sample.input.setValue({i}, std::stod(token)); } return new_sample; @@ -65,8 +65,9 @@ int DataBase::load(const std::string &db_filename) { } TrainSample new_sample = readLine(line); - if (new_sample.input.numElements() == 0) + if (new_sample.input.numElements() == 0) { continue; + } samples.add(new_sample); } diff --git a/src/model/dataBase.hpp b/src/model/dataBase.hpp index 0ad890a..cf17ba8 100644 --- a/src/model/dataBase.hpp +++ b/src/model/dataBase.hpp @@ -4,6 +4,7 @@ #include "config.hpp" #include #include +#include namespace nn::model { const std::string DATABASE_FILE_EXETENTION = ".nndb"; diff --git a/src/model/model.cpp b/src/model/model.cpp index eb2f22e..694dd8c 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,5 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" +#include "dataBase.hpp" #include #include #include @@ -77,6 +78,9 @@ void Model::initOptimizer() { void Model::initVisual() { visual.start(); + if (!config.visualConfig.enableNetwrokVisual) + return; + for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { visual.addVisualSubNetwork(network[i]->getVisual()); network[i]->getVisual()->setVstate(visual.Vstate); @@ -89,6 +93,7 @@ std::uint32_t Model::calculateSubNetWidth() const { void Model::initModel() { const std::uint32_t WIDTH = calculateSubNetWidth(); + size_t param_amount = 0; for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { ISubNetworkConfig &_config = *config.networkConfig.SubNetworksConfig[i]; @@ -98,13 +103,20 @@ void Model::initModel() { } else if (_config.NNLable() == cnn::CNN_LABLE) { addCNN(WIDTH, _config); } + + param_amount += network[i]->getParams().numElements(); } + + std::cout << "initialize model - " + << param_amount << " parameters, " + << config.networkConfig.SubNetworksConfig.size() << " sub networks" + << std::endl; } void Model::addFNN(const std::uint32_t width, 
ISubNetworkConfig &_config) { fnn::FNNConfig &sub_ = (fnn::FNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, @@ -120,7 +132,7 @@ void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { void Model::addCNN(const std::uint32_t width, ISubNetworkConfig &_config) { cnn::CNNConfig &sub_ = (cnn::CNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, @@ -158,9 +170,10 @@ void Model::updateWeights(const int batchSize) { void Model::Backward(const global::Tensor &output) { global::Tensor deltas = output; + global::Tensor *delta = &deltas; for (int i = static_cast(network.size()) - 1; i >= 0; --i) { - network[i]->backward(deltas); + network[i]->backward(&delta); deltas = network[i]->getInput(); } } @@ -176,16 +189,16 @@ global::ValueType Model::runBackPropagation( } resetNetworkGradient(); + global::Tensor output({outputSize()}); for (size_t i = 0; i < batch.size(); ++i) { - auto current_sample_ptr = batch.samples.at(i); + TrainSample *current_sample_ptr = batch.samples.at(i); visual.updatePrediction(current_sample_ptr->pre); - runModel(transformation(current_sample_ptr->input)); - - global::Tensor output({outputSize()}); - output[current_sample_ptr->pre.index] = 1; + runModel(current_sample_ptr->input, transformation); if (doBackward) { + output.zero(); + output.setValue({current_sample_ptr->pre.index}, 1); Backward(output); updateWeights(batch.size()); } @@ -280,7 +293,7 @@ bool Model::autoEvaluating( void Model::autoSave(const int i) { if (config.trainingConfig.isAutoSave() && i % config.trainingConfig.getAutoSave().saveEvery == 0) { - save(config.trainingConfig.getAutoSave().dataFilenameAutoSave); + save(config.trainingConfig.getAutoSave().dataFilenameAutoSave, false); } } @@ -328,6 +341,15 @@ float Model::calculatePercentage(size_t currentSize, size_t totalSize) { return 100.0f * static_cast(currentSize) / static_cast(totalSize); } +void Model::runModel(const global::Tensor &input, + global::Transformation transformation) { + if (transformation) { + runModel(transformation(input)); + } else { + runModel(input); + } +} + modelResult Model::evaluateModel( DataBase &dataBase, const bool cancleOnError, @@ -347,11 +369,17 @@ modelResult Model::evaluateModel( for (int i = 0; i < result.dbSize; ++i) { TrainSample &sample = dataBase.getSample(i); - runModel(transformation(sample.input)); + runModel(sample.input, transformation); + + size_t predicted_index = 0; + float max_value = getOutput().getValue({0}); - size_t predicted_index = std::distance( - getOutput().begin(), - std::max_element(getOutput().begin(), getOutput().end())); + for (size_t j = 1; j < getOutput().numElements(); ++j) { + if (getOutput().getValue({j}) > max_value) { + max_value = getOutput().getValue({j}); + predicted_index = j; + } + } if (showProgressbar) { bar++; @@ -396,28 +424,40 @@ size_t Model::inputSize() const { return network[0]->inputSize(); } -void Model::save(const std::string &file) { +void Model::save(const std::string &file, bool print) { std::ofstream outFile(file); + if (print) { + std::cout << "Start saving" << std::endl; + } + for (size_t i = 0; i < network.size(); ++i) { global::Tensor params = network[i]->getParams(); outFile << params.numElements() << " "; for (size_t j = 0; j < 
params.numElements(); ++j) { - outFile << params[j] << " "; + outFile << params.getValue({j}) << " "; } - outFile << std::endl; } + if (print) { + std::cout << " saving complete" << std::endl; + } + outFile.close(); } -void Model::load(const std::string &file) { +void Model::load(const std::string &file, bool print) { std::ifstream inFile(file); std::string line; int networkI = 0; + + if (print) { + std::cout << "Start loading" << std::endl; + } + while (std::getline(inFile, line)) { std::istringstream iss(line); @@ -426,17 +466,19 @@ void Model::load(const std::string &file) { global::Tensor numbers({ParamSize}); float num; - for (size_t i = 0; i < ParamSize; ++i) { iss >> num; - numbers[i] = num; + numbers.setValue({i}, num); } network[networkI]->setParams(numbers); - networkI++; } + if (print) { + std::cout << " loading complete" << std::endl; + } + inFile.close(); } @@ -444,12 +486,12 @@ global::Prediction Model::getPrediction() const { size_t max = 0; for (size_t i = 1; i < outputSize(); ++i) { - if (getOutput()[i] > getOutput()[max]) { + if (getOutput().getValue({i}) > getOutput().getValue({max})) { max = i; } } - return global::Prediction(max, getOutput()[max]); + return global::Prediction(max, getOutput().getValue({max})); } void Model::setTraining() { diff --git a/src/model/optimizers.cpp b/src/model/optimizers.cpp index 9014aa2..b34a89a 100644 --- a/src/model/optimizers.cpp +++ b/src/model/optimizers.cpp @@ -1,7 +1,8 @@ #include "optimizers.hpp" namespace nn::model { -void ConstantOptimizer::step(global::Tensor &weight, const global::Tensor &grad) { - weight -= grad * (config.getLearningRate() / batchSize); +void ConstantOptimizer::step(global::Tensor &weight, global::Tensor &grad) { + grad *= config.getLearningRate() / batchSize; + weight -= grad; } } // namespace nn::model diff --git a/src/model/optimizers.hpp b/src/model/optimizers.hpp index a910edf..203ea4e 100644 --- a/src/model/optimizers.hpp +++ b/src/model/optimizers.hpp @@ -13,7 +13,7 @@ class IOptimizer { public: virtual ~IOptimizer() = default; - virtual void step(global::Tensor &weight, const global::Tensor &grad) = 0; + virtual void step(global::Tensor &weight, global::Tensor &grad) = 0; virtual void reset() = 0; void setOfset(const int batchSize_) { batchSize = batchSize_; } @@ -27,7 +27,7 @@ class ConstantOptimizer : public IOptimizer { ConstantOptimizer(const ConstantOptimizerConfig &config_) : config(config_) {} - void step(global::Tensor &weight, const global::Tensor &grad) override; + void step(global::Tensor &weight, global::Tensor &grad) override; void reset() override {} }; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 80fd8cb..0bc82f9 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,30 +1,137 @@ +#include "tensor_gpu.hpp" +#include #include #include #include namespace nn::global { -Tensor::Tensor(const std::vector &shape, float init) - : shape(shape) { - if (shape.empty()) { +bool Tensor::isGpu = false; +size_t Tensor::tensorCount = 0; + +Tensor::Tensor(const std::vector &shape_, ValueType init) { + if (shape_.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } size_t totalSize = std::accumulate( - shape.begin(), - shape.end(), + shape_.begin(), + shape_.end(), size_t(1), std::multiplies<>()); - data.assign(totalSize, init); + + shape = shape_; + if (isGpu) { + gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_data_size = totalSize; + fill(init); + } else { + cpu_data.assign(totalSize, init); + } + computeStrides(); + 
+ tensorCount++; +} + +void Tensor::toGpu() { + if (isGpu) + return; + + if (tensorCount > 0) + throw std::runtime_error("Cannot switch to GPU mode: tensors already exist in CPU mode."); + + isGpu = true; +} + +void Tensor::toCpu() { + if (!isGpu) + return; + + if (tensorCount > 0) + throw std::runtime_error("Cannot switch to CPU mode: tensors already exist in GPU mode."); + + isGpu = false; +} + +Tensor::Tensor(const Tensor &other) { + shape = other.shape; + strides = other.strides; + if (isGpu) { + gpu_data_size = other.gpu_data_size; + gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + } else { + cpu_data = other.cpu_data; + } +} + +size_t Tensor::numElements() const { + if (isGpu) { + return gpu_data_size; + } + return cpu_data.size(); +} + +void Tensor::getData(std::vector &dest) const { + if (isGpu) { + tensor_gpu::copyToHost(dest.data(), gpu_data, gpu_data_size * sizeof(ValueType)); + } else { + dest = cpu_data; + } +} + +void Tensor::fill(const ValueType &value) { + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + tensor_gpu::add_scalar(gpu_data, value, gpu_data, gpu_data_size); + } else { + for (auto &n : cpu_data) { + n = value; + } + } +} + +void Tensor::zero() { + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + } else { + fill(0); + } } Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - data = other.data; + if (isGpu) { + if (gpu_data_size != other.gpu_data_size) { + ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + gpu_data_size = other.gpu_data_size; + tensor_gpu::copyDeviceToDevice(temp, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::deallocate(gpu_data); + gpu_data = temp; + } else { + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + } + } else { + cpu_data = other.cpu_data; + } + shape = other.shape; strides = other.strides; + return *this; +} + +Tensor &Tensor::operator=(const std::vector &other) { + if (other.size() != numElements()) { + throw std::length_error("Tensor assignment size mismatch"); + } + + if (isGpu) { + tensor_gpu::copyToDevice(gpu_data, other.data(), gpu_data_size * sizeof(ValueType)); + } else { + cpu_data = other; + } return *this; } @@ -52,150 +159,125 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { return index; } -ValueType &Tensor::operator()(const std::vector &indices) { - return data[flattenIndex(indices)]; -} - -ValueType Tensor::operator()(const std::vector &indices) const { - return data[flattenIndex(indices)]; -} - -Tensor Tensor::operator+(const Tensor &other) const { - if (shape != other.shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator+."); +ValueType Tensor::getValue(const std::vector &indices) const { + if (isGpu) { + return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] + b[i]; - return result; + + return cpu_data[flattenIndex(indices)]; } -Tensor Tensor::operator-(const Tensor &other) const { - if (shape != other.shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator-."); +void Tensor::insertRange(const Tensor &other, + const size_t startO, const size_t startT, + 
const size_t length) { + if (isGpu) { + tensor_gpu::copyDeviceToDevice(gpu_data + startT, other.gpu_data + startO, length * sizeof(ValueType)); + } else { + for (size_t i = 0; i < length; ++i) { + cpu_data[i + startT] = other.cpu_data[i + startO]; + } } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] - b[i]; - return result; } -Tensor Tensor::operator/(const Tensor &other) const { - if (shape != other.shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator/."); +void Tensor::setValue(const std::vector &indices, const ValueType value) { + if (isGpu) { + tensor_gpu::setValueAt(gpu_data, flattenIndex(indices), value); + } else { + cpu_data[flattenIndex(indices)] = value; } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] / b[i]; - return result; } Tensor &Tensor::operator+=(const Tensor &other) { if (shape != other.shape) - throw std::invalid_argument("Shape mismatch."); - - float *__restrict__ a = data.data(); - const float *__restrict__ b = other.data.data(); - const size_t N = data.size(); - - for (size_t i = 0; i < N; ++i) - a[i] += b[i]; - + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + if (isGpu) { + tensor_gpu::add_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] += other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator-=(const Tensor &other) { if (shape != other.shape) - throw std::invalid_argument("Shape mismatch."); - float *a = data.data(); - const float *b = other.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - a[i] -= b[i]; + throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); + if (isGpu) { + tensor_gpu::subtraction_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] -= other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator*=(const Tensor &other) { if (shape != other.shape) throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - data[i] *= other.data[i]; + if (isGpu) { + tensor_gpu::multiply_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] *= other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator/=(const Tensor &other) { if (shape != other.shape) throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - data[i] /= other.data[i]; + if (isGpu) { + tensor_gpu::division_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] /= other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator*=(ValueType scalar) { - for (auto &x : data) - x *= scalar; + if (isGpu) { + tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + for (auto &x : cpu_data) + x *= scalar; + } return *this; } Tensor &Tensor::operator-=(ValueType scalar) { - for (auto &x : data) - x -= scalar; + if (isGpu) { + tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + 
for (auto &x : cpu_data) + x -= scalar; + } return *this; } Tensor &Tensor::operator+=(ValueType scalar) { - for (auto &x : data) - x += scalar; + if (isGpu) { + tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + for (auto &x : cpu_data) + x += scalar; + } return *this; } + Tensor &Tensor::operator/=(ValueType scalar) { - for (auto &x : data) - x /= scalar; + if (isGpu) { + tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + for (auto &x : cpu_data) + x /= scalar; + } return *this; } -Tensor Tensor::operator*(ValueType scalar) const { - Tensor result(*this); - result *= scalar; - return result; -} - -Tensor Tensor::operator/(ValueType scalar) const { - Tensor result(*this); - result /= scalar; - return result; -} - -Tensor Tensor::operator-(ValueType scalar) const { - Tensor result(*this); - result -= scalar; - return result; -} - -Tensor Tensor::operator+(ValueType scalar) const { - Tensor result(*this); - result += scalar; - return result; -} - -Tensor Tensor::matmul(const Tensor &other) const { +void Tensor::matmul(const Tensor &other, Tensor &result) const { const auto &aShape = shape; const auto &bShape = other.shape; - if (aShape.size() != 2 || bShape.size() != 1) throw std::runtime_error("matmul: unsupported shapes."); @@ -204,72 +286,74 @@ Tensor Tensor::matmul(const Tensor &other) const { if (K != bShape[0]) throw std::runtime_error("matmul: shape mismatch."); - Tensor result({M}); - - const float *A = data.data(); - const float *B = other.data.data(); - float *R = result.data.data(); - - for (size_t i = 0; i < M; ++i) { - float sum = 0.0f; - size_t base = i * K; - for (size_t j = 0; j < K; ++j) { - sum += A[base + j] * B[j]; + result.zero(); + + if (isGpu) { + tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); + } else { + const float *A = cpu_data.data(); + const float *B = other.cpu_data.data(); + float *R = result.cpu_data.data(); + + for (size_t i = 0; i < M; ++i) { + float sum = 0.0f; + size_t base = i * K; + for (size_t j = 0; j < K; ++j) { + sum += A[base + j] * B[j]; + } + R[i] = sum; } - R[i] = sum; } - return result; } -Tensor Tensor::outer(const Tensor &a, const Tensor &b) { - const std::vector &aShape = a.getShape(); - const std::vector &bShape = b.getShape(); - - if (aShape.size() != 1 || bShape.size() != 1) { +void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) { + if (a.shape.size() != 1 || b.shape.size() != 1) { throw std::runtime_error("outer: both tensors must be 1D vectors"); } - size_t m = aShape[0]; - size_t n = bShape[0]; + size_t m = a.shape[0]; + size_t n = b.shape[0]; + + result.zero(); - Tensor result({m, n}); - float *r = result.data.data(); - const float *A = a.data.data(); - const float *B = b.data.data(); + if (isGpu) { + tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); + } else { + float *r = result.cpu_data.data(); + const float *A = a.cpu_data.data(); + const float *B = b.cpu_data.data(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - r[i * n + j] = A[i] * B[j]; + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + r[i * n + j] += A[i] * B[j]; + } } } - return result; } -Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = shape; - const auto &vShape = vec.shape; - - if (wShape.size() != 2 || vShape.size() != 1) +void Tensor::matmulT(const Tensor &vec, Tensor &result) const { + if (shape.size() != 2 || vec.shape.size() != 1) throw std::runtime_error("matmulT: bad 
dimensions"); - - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) + if (vec.shape[0] != shape[0]) throw std::runtime_error("matmulT: incompatible"); - Tensor result({N}, 0.0f); + result.zero(); - const float *W = data.data(); - const float *V = vec.data.data(); - float *R = result.data.data(); - - for (size_t i = 0; i < N; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < M; ++j) { - sum += W[j * N + i] * V[j]; + if (isGpu) { + tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]); + } else { + for (size_t i = 0; i < shape[1]; ++i) { + for (size_t j = 0; j < shape[0]; ++j) { + result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j]; + } } - R[i] = sum; } - return result; +} + +Tensor::~Tensor() { + if (isGpu) { + tensor_gpu::deallocate(gpu_data); + } + tensorCount--; } } // namespace nn::global diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu new file mode 100644 index 0000000..f11fef7 --- /dev/null +++ b/src/model/tensor_gpu.cu @@ -0,0 +1,437 @@ +#include +#include "tensor_gpu.hpp" +#include +#include + +namespace nn::global::tensor_gpu { +#define CUDA_CHECK(call) do { \ + cudaError_t e = (call); \ + if (e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + throw std::runtime_error(cudaGetErrorString(e)); \ + } \ +} while(0) + +// ================================================== +// Memory Management +// ================================================== +void* allocate(std::size_t size) { + void* devicePtr = nullptr; + if (cudaMalloc(&devicePtr, size) != cudaSuccess) { + throw std::runtime_error("cudaMalloc failed"); + } + return devicePtr; +} + +void deallocate(void* devicePtr) { + if (devicePtr) { + cudaFree(devicePtr); + } +} + +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t size) { + CUDA_CHECK(cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice)); +} + +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t size) { + CUDA_CHECK(cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice)); +} + +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { + CUDA_CHECK(cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost)); +} + +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { + CUDA_CHECK(cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice)); +} + +ValueType getValueAt(const ValueType* devicePtr, std::size_t index) { + ValueType value; + CUDA_CHECK(cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost)); + return value; +} + +// ================================================== +// Utility Kernels +// ================================================== +__global__ void zeroKernel(ValueType* data, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) data[idx] = 0.0f; +} + +void zero(ValueType* deviceData, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + zeroKernel<<>>(deviceData, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +// ================================================== +// Vector-Vector Operations +// ================================================== +__global__ void addVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) 
C[idx] = A[idx] + B[idx];
+}
+
+__global__ void subVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] - B[idx];
+}
+
+__global__ void mulVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] * B[idx];
+}
+
+__global__ void divVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] / B[idx];
+}
+
+void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    addVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    subVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    mulVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    divVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Vector-Scalar Operations
+// ==================================================
+__global__ void addScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] + B;
+}
+
+__global__ void subScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] - B;
+}
+
+__global__ void mulScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] * B;
+}
+
+__global__ void divScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] / B;
+}
+
+void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    addScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    subScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    mulScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    divScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Activation Functions
+// ==================================================
+__global__ void reluKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f;
+}
+
+__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType derivative = (input[idx] > 0.0f) ? 1.0f : 0.0f;
+        output[idx] *= derivative; // FIX: Changed = to *=
+    }
+}
+
+void relu(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    reluKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    reluDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+__global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType x = input[idx];
+        output[idx] = 1.0f / (1.0f + expf(-x));
+    }
+}
+
+__global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType x = input[idx];
+        ValueType s = 1.0f / (1.0f + expf(-x));
+        ValueType derivative = s * (1.0f - s);
+        output[idx] *= derivative;
+    }
+}
+
+void sigmoid(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    sigmoidKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    sigmoidDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+__global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) output[idx] = tanhf(input[idx]);
+}
+
+__global__ void tanhDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType t = tanhf(input[idx]);
+        ValueType derivative = 1.0f - t * t;
+        output[idx] *= derivative;
+    }
+}
+
+void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    tanhKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    tanhDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+__global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) output[idx] = (input[idx] > 0.0f) ? input[idx] : alpha * input[idx];
+}
+
+__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType derivative = (input[idx] > 0.0f) ? 1.0f : alpha;
+        output[idx] *= derivative; // FIX: Changed = to *=
+    }
+}
+
+void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    leakyReluKernel<<<numBlocks, blockSize>>>(input, output, count, alpha);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    leakyReluDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count, alpha);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Softmax
+// ==================================================
+__global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    extern __shared__ ValueType shared[];
+    std::size_t tid = threadIdx.x;
+    std::size_t blockStart = blockIdx.x * blockDim.x;
+    std::size_t idx = blockStart + tid;
+
+    // always write shared for every thread in block
+    shared[tid] = (idx < count) ? input[idx] : -INFINITY;
+    __syncthreads();
+
+    // compute max (naive per-thread loop)
+    ValueType max_val = shared[0];
+    for (unsigned int i = 1; i < blockDim.x; ++i) {
+        std::size_t curIdx = blockStart + i;
+        if (curIdx < count) max_val = fmaxf(max_val, shared[i]);
+    }
+    __syncthreads();
+
+    ValueType e = (idx < count) ? expf(shared[tid] - max_val) : 0.0f;
+    shared[tid] = e;
+    __syncthreads();
+
+    // compute sum (naive)
+    ValueType sum = 0.0f;
+    for (unsigned int i = 0; i < blockDim.x; ++i) {
+        std::size_t curIdx = blockStart + i;
+        if (curIdx < count) sum += shared[i];
+    }
+    __syncthreads();
+
+    if (idx < count) output[idx] = shared[tid] / (sum == 0.0f ? 1.0f : sum);
+}
+
+void softmax(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    std::size_t sharedMemSize = blockSize * sizeof(ValueType);
+    // Note: softmaxKernel normalizes within a single block, so the result is a true
+    // softmax only when count <= blockSize; larger tensors are normalized per block.
+    softmaxKernel<<<numBlocks, blockSize, sharedMemSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Index Utilities
+// ==================================================
+__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape,
+                                   const size_t* strides, size_t ndim, size_t* outIndex) {
+    size_t idx = 0;
+    for (size_t i = 0; i < ndim; ++i) {
+        if (indices[i] >= shape[i]) {
+            *outIndex = size_t(-1);
+            return;
+        }
+        idx += indices[i] * strides[i];
+    }
+    *outIndex = idx;
+}
+
+__global__ void computeFlatIndexKernel(const size_t* indices, const size_t* strides,
+                                       size_t rank, size_t* outIndex) {
+    size_t flatIndex = 0;
+    for (size_t i = 0; i < rank; ++i) {
+        flatIndex += indices[i] * strides[i];
+    }
+    *outIndex = flatIndex;
+}
+
+// ==================================================
+// Matrix Operations
+// ==================================================
+__global__ void matmulKernel(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) {
+    size_t row = blockIdx.x * blockDim.x + threadIdx.x;
+    if (row < M) {
+        ValueType sum = 0;
+        for (size_t j = 0; j < K; ++j) {
+            sum += A[row * K + j] * B[j];
+        }
+        R[row] = sum;
+    }
+}
+
+__global__ void outerKernel(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total = m * n;
+    if (idx < total) {
+        size_t i = idx / n;
+        size_t j = idx % n;
+        result[i * n + j] += a[i] * b[j];
+    }
+}
+
+__global__ void matmulTKernel(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) {
+    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+    if (col < N) {
+        ValueType sum = 0.0f;
+        for (size_t i = 0; i < M; ++i) {
+            sum += W[i * N + col] * V[i];
+        }
+        R[col] = sum;
+    }
+}
+
+void matmul(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) {
+    const int blockSize = 256;
+    int gridSize = (M + blockSize - 1) / blockSize;
+    matmulKernel<<<gridSize, blockSize>>>(A, B, R, M, K);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) {
+    const int blockSize = 256;
+    int gridSize = (m * n + blockSize - 1) / blockSize;
+    outerKernel<<<gridSize, blockSize>>>(a, b, result, m, n);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) {
+    const int blockSize = 256;
+    int gridSize = (N + blockSize - 1) / blockSize;
+    matmulTKernel<<<gridSize, blockSize>>>(W, V, R, M, N);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+} // namespace nn::global::tensor_gpu
diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp
new file mode 100644
index 0000000..ad2ba02
--- /dev/null
+++ b/src/model/tensor_gpu.hpp
@@ -0,0 +1,81 @@
+#ifndef TENSOR_GPU
+#define TENSOR_GPU
+
+#include <cstddef>
+
+namespace nn::global {
+using ValueType = float;
+}
+
+class Tensor; // Forward declaration
+
+namespace nn::global::tensor_gpu {
+
+// ============================
+// Memory Management
+// ============================
+void*
allocate(std::size_t size); +void deallocate(void* devicePtr); + +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t count); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count); +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t count); + +void zero(ValueType* deviceData, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Vector) +// ============================ +void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Scalar) +// ============================ +void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); + +// ============================ +// Activation Functions +// ============================ + +// ReLU +void relu(const ValueType* input, ValueType* output, std::size_t count); +void relu_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Leaky ReLU +void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); +void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); + +// Sigmoid +void sigmoid(const ValueType* input, ValueType* output, std::size_t count); +void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Tanh +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count); +void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Softmax +void softmax(const ValueType* net, ValueType* out, std::size_t size); + +// ============================ +// Single Value Access +// ============================ +ValueType getValueAt(const ValueType* devicePtr, std::size_t index); +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value); + +// ============================ +// Matrix Operations +// ============================ +void matmul(const ValueType* A, const ValueType* B, ValueType* R, std::size_t M, std::size_t K); +void matmulT(const ValueType* W, const ValueType* V, ValueType* R, std::size_t M, std::size_t N); +void outer(const ValueType* a, const ValueType* b, ValueType* result, std::size_t m, std::size_t n); + +} // namespace nn::global::tensor_gpu + +#endif // TENSOR_GPU + diff --git a/src/networks/cnn/CNNetwork.cpp b/src/networks/cnn/CNNetwork.cpp index fc45ba3..a2d8b1c 100644 --- a/src/networks/cnn/CNNetwork.cpp +++ b/src/networks/cnn/CNNetwork.cpp @@ -16,7 +16,7 @@ void CNNetwork::forward(const global::Tensor &newInput) { input = newInput; } -void CNNetwork::backward(const global::Tensor &) { +void CNNetwork::backward(global::Tensor **) { } global::ValueType CNNetwork::getLoss(const global::Prediction &) const { diff --git a/src/networks/cnn/CNNetwork.hpp b/src/networks/cnn/CNNetwork.hpp index e2ad58d..1c08705 100644 --- a/src/networks/cnn/CNNetwork.hpp +++ 
diff --git a/src/networks/cnn/CNNetwork.cpp b/src/networks/cnn/CNNetwork.cpp
index fc45ba3..a2d8b1c 100644
--- a/src/networks/cnn/CNNetwork.cpp
+++ b/src/networks/cnn/CNNetwork.cpp
@@ -16,7 +16,7 @@ void CNNetwork::forward(const global::Tensor &newInput) {
     input = newInput;
 }
 
-void CNNetwork::backward(const global::Tensor &) {
+void CNNetwork::backward(global::Tensor **) {
 }
 
 global::ValueType CNNetwork::getLoss(const global::Prediction &) const {
diff --git a/src/networks/cnn/CNNetwork.hpp b/src/networks/cnn/CNNetwork.hpp
index e2ad58d..1c08705 100644
--- a/src/networks/cnn/CNNetwork.hpp
+++ b/src/networks/cnn/CNNetwork.hpp
@@ -24,7 +24,7 @@ class CNNetwork : public INetwork {
     ~CNNetwork() override = default;
 
     void forward(const global::Tensor &newInput) override;
-    void backward(const global::Tensor &outputDeltas) override;
+    void backward(global::Tensor **outputDeltas) override;
     void updateWeights(IOptimizer &optimizer) override;
 
     void resetGradient() override;
diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp
index dc8f7ea..b467dc2 100644
--- a/src/networks/fnn/DenseLayer.cpp
+++ b/src/networks/fnn/DenseLayer.cpp
@@ -1,5 +1,7 @@
 #include "DenseLayer.hpp"
+#include
 #include
+#include
 
 namespace nn::model::fnn {
 DenseLayer::DenseLayer(
@@ -11,6 +13,7 @@ DenseLayer::DenseLayer(
       out({size}),
       parameters(size, prevSize),
       gradients(size, prevSize),
+      deltaL({size}),
       activationFunction(activation) {
     if (randomInit) {
         fillParamRandom();
@@ -27,43 +30,46 @@ void Hidden_Layer::CreateDropoutMask() {
     static thread_local std::mt19937 rng{std::random_device{}()};
     std::bernoulli_distribution bernoulli(keepProb);
 
+    static std::vector<ValueType> temp(dropoutMask.numElements(), 0);
     for (size_t i = 0; i < dropoutMask.numElements(); ++i) {
-        dropoutMask[i] = static_cast<ValueType>(bernoulli(rng));
+        temp[i] = static_cast<ValueType>(bernoulli(rng));
     }
+
+    dropoutMask = temp;
 }
 
 void Output_Layer::forward(const global::Tensor &metrix) {
-    net = parameters.weights.matmul(metrix);
+    parameters.weights.matmul(metrix, net);
     net += parameters.biases;
 
     activationFunction.activate(net, out);
 }
 
-global::Tensor Output_Layer::getDelta(const global::Tensor &output) {
-    global::Tensor deltas = out;
-    deltas -= output;
-
-    return deltas;
+void Output_Layer::getDelta(const global::Tensor &output) {
+    deltaL = out;
+    deltaL -= output;
 }
 
 void Output_Layer::backward(
-    global::Tensor &deltas,
+    global::Tensor **deltas,
     const global::Tensor &prevLayer,
     const LayerParams *) {
     if (activationFunction.getType() == ActivationType::Softmax) {
-        deltas = getDelta(deltas);
+        getDelta(**deltas);
     } else {
-        activationFunction.derivativeActivate(out, deltas);
+        activationFunction.derivativeActivate(out, **deltas);
+        deltaL = **deltas;
     }
 
-    gradients.biases += deltas;
-    gradients.weights += global::Tensor::outer(deltas, prevLayer);
+    gradients.biases += deltaL;
+    global::Tensor::outer(deltaL, prevLayer, gradients.weights);
+    *deltas = &deltaL;
 }
 
 global::ValueType Output_Layer::getCrossEntropyLoss(
     const global::Tensor &prediction, const size_t target) {
-    return -std::log(std::max(prediction[target], MIN_LOSS_VALUE));
+    return -std::log(std::max(prediction.getValue({target}), MIN_LOSS_VALUE));
 }
 
 global::ValueType Output_Layer::getLoss(const global::Prediction &targets) {
@@ -73,8 +79,8 @@ global::ValueType Output_Layer::getLoss(const global::Prediction &targets) {
 
 void Hidden_Layer::forward(const global::Tensor &metrix) {
     if (isTraining) CreateDropoutMask();
-
-    net = parameters.weights.matmul(metrix);
+
+    parameters.weights.matmul(metrix, net);
     net += parameters.biases;
 
     if (isTraining && config.dropoutRate > 0.0f) {
@@ -86,32 +92,31 @@ void Hidden_Layer::forward(const global::Tensor &metrix) {
 }
 
 void Hidden_Layer::backward(
-    global::Tensor &deltas,
+    global::Tensor **deltas,
     const global::Tensor &prevLayer,
     const LayerParams *nextLayer) {
     if (!nextLayer) return;
 
-    deltas = getDelta(deltas, *nextLayer);
+    calculateDelta(**deltas, *nextLayer);
 
     if (isTraining && config.dropoutRate) {
-        deltas *= dropoutMask;
+        deltaL *= dropoutMask;
    }
 
-    gradients.biases += deltas;
+    gradients.biases += deltaL;
 
-    gradients.weights += global::Tensor::outer(deltas, prevLayer);
+    global::Tensor::outer(deltaL, prevLayer, gradients.weights);
+    *deltas = &deltaL;
 }
 
-global::Tensor Hidden_Layer::getDelta(
+void Hidden_Layer::calculateDelta(
     const global::Tensor &output,
     const LayerParams &nextLayer) {
-    auto deltas = nextLayer.weights.matmulT(output);
-    activationFunction.derivativeActivate(out, deltas);
-
-    return deltas;
+    nextLayer.weights.matmulT(output, deltaL);
+    activationFunction.derivativeActivate(out, deltaL);
 }
 
 size_t DenseLayer::getParamCount() const {
@@ -124,41 +129,29 @@ void DenseLayer::updateWeight(nn::model::IOptimizer &optimizer) {
 }
 
 const global::Tensor DenseLayer::getData() const {
-    global::Tensor matrix({parameters.paramSize()});
-
-    size_t currentI = 0;
-    for (size_t i = 0; i < size(); ++i) {
-        for (size_t j = 0; j < prevSize(); ++j) {
-            matrix[currentI] = parameters.weights({i, j});
+    size_t weightsSize = parameters.weights.numElements();
+    size_t biasesSize = parameters.biases.numElements();
 
-            ++currentI;
-        }
-    }
+    global::Tensor matrix({weightsSize + biasesSize});
 
-    for (size_t i = 0; i < size(); ++i) {
-        matrix[currentI] = parameters.biases[i];
+    // Copy weights
+    matrix.insertRange(parameters.weights, 0, 0, weightsSize);
 
-        ++currentI;
-    }
+    // Copy biases
+    matrix.insertRange(parameters.biases, 0, weightsSize, biasesSize);
 
     return matrix;
 }
 
-void DenseLayer::setData(const global::Tensor newParam) {
-    size_t currentI = 0;
-    for (size_t i = 0; i < size(); ++i) {
-        for (size_t j = 0; j < prevSize(); ++j) {
-            parameters.weights({i, j}) = newParam[currentI];
+void DenseLayer::setData(const global::Tensor newParam, const size_t offset) {
+    size_t weightsSize = parameters.weights.numElements();
+    size_t biasesSize = parameters.biases.numElements();
 
-            ++currentI;
-        }
-    }
+    // Copy into weights
+    parameters.weights.insertRange(newParam, offset, 0, weightsSize);
 
-    for (size_t i = 0; i < size(); ++i) {
-        parameters.biases[i] = newParam[currentI];
-
-        ++currentI;
-    }
+    // Copy into biases
+    parameters.biases.insertRange(newParam, offset + weightsSize, 0, biasesSize);
 }
 
 void DenseLayer::fillParamRandom() {
@@ -167,18 +160,20 @@ void DenseLayer::fillParamRandom() {
     global::ValueType std_dev = std::sqrt(2.0 / static_cast<ValueType>(prevSize()));
     std::normal_distribution<> dist(0.0, std_dev);
 
-    for (auto &value : parameters.weights) {
-        value = dist(gen);
+    std::vector<ValueType> temp(parameters.weights.numElements());
+    for (size_t i = 0; i < temp.size(); ++i) {
+        temp[i] = dist(gen);
     }
+    parameters.weights = temp;
 }
 
 void DenseLayer::resetDots() {
-    net.fill(0);
-    out.fill(0);
+    net.zero();
+    out.zero();
 }
 
 void DenseLayer::resetGradient() {
-    gradients.biases.fill(0);
-    gradients.weights.fill(0);
+    gradients.biases.zero();
+    gradients.weights.zero();
 }
 } // namespace nn::model::fnn
diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp
index 84587aa..651adb2 100644
--- a/src/networks/fnn/DenseLayer.hpp
+++ b/src/networks/fnn/DenseLayer.hpp
@@ -1,8 +1,8 @@
 #ifndef DENSELAYER
 #define DENSELAYER
 
-#include "../../model/config.hpp"
 #include "../src/model/optimizers.hpp"
+#include
 
 namespace nn::model::fnn {
 constexpr global::ValueType MIN_LOSS_VALUE = 1e-10;
@@ -11,11 +11,15 @@ struct LayerParams {
     global::Tensor weights;
     global::Tensor biases;
 
+    size_t size_;
+    size_t prevSize_;
+
     LayerParams(size_t out_dim, size_t in_dim)
-        : weights({out_dim, in_dim}), biases({out_dim}) {}
+        : weights({out_dim, in_dim}), biases({out_dim}),
+          size_(out_dim), prevSize_(in_dim) {}
 
-    size_t size() const { return biases.numElements(); }
-    size_t prevSize() const { return weights.getShape()[1]; }
+    size_t size() const { return size_; }
+    size_t prevSize() const { return prevSize_; }
 
     size_t paramSize() const { return biases.numElements() + weights.numElements(); }
 };
@@ -28,6 +32,8 @@ class DenseLayer {
     LayerParams parameters;
     LayerParams gradients;
 
+    global::Tensor deltaL;
+
     Activation activationFunction;
 
     bool isTraining{false};
@@ -45,7 +51,7 @@ class DenseLayer {
     virtual void forward(const global::Tensor &metrix) = 0;
     void updateWeight(IOptimizer &optimizer);
     virtual void backward(
-        global::Tensor &deltas,
+        global::Tensor **deltas,
         const global::Tensor &prevLayer,
         const LayerParams *nextLayer = nullptr) = 0;
     virtual global::ValueType getLoss(const global::Prediction &) { return 0; };
@@ -65,7 +71,7 @@ class DenseLayer {
     size_t getParamCount() const;
     const global::Tensor getData() const;
 
-    void setData(const global::Tensor newParam);
+    void setData(const global::Tensor newParam, const size_t offset);
 
     void setTraining(const bool state) { isTraining = state; }
 };
@@ -73,7 +79,7 @@ class DenseLayer {
 class Hidden_Layer : public DenseLayer {
 private:
     const DenseLayerConfig &config;
-    global::Tensor getDelta(
+    void calculateDelta(
         const global::Tensor &output,
         const LayerParams &nextLayer);
 
@@ -93,7 +99,7 @@ class Hidden_Layer : public DenseLayer {
     void forward(const global::Tensor &metrix) override;
 
     void backward(
-        global::Tensor &deltas,
+        global::Tensor **deltas,
         const global::Tensor &prevLayer,
         const LayerParams *nextLayer) override;
 };
@@ -102,7 +108,7 @@ class Output_Layer : public DenseLayer {
 private:
     const FNNConfig &config;
 
-    global::Tensor getDelta(const global::Tensor &output);
+    void getDelta(const global::Tensor &output);
 
     static global::ValueType getCrossEntropyLoss(
         const global::Tensor &prediction, const size_t target);
@@ -122,7 +128,7 @@ class Output_Layer : public DenseLayer {
     void forward(const global::Tensor &metrix) override;
 
     void backward(
-        global::Tensor &deltas,
+        global::Tensor **deltas,
         const global::Tensor &prevLayer,
         const LayerParams *) override;
 
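The backward interface now threads a global::Tensor** through the layers instead of copying delta tensors: each layer writes its result into its own deltaL member and repoints the caller's cursor at it (*deltas = &deltaL). Below is a rough, self-contained analogue of that pointer-swapping idea; the project types are replaced by std::vector so it compiles standalone, and only the calling convention is taken from this patch.

// Hypothetical sketch of the delta-chaining convention (not project code).
#include <iostream>
#include <vector>

// Stand-in for a layer: owns its own delta buffer, like DenseLayer::deltaL.
struct Layer {
    std::vector<float> deltaL;

    // Mirrors backward(global::Tensor **deltas): read the incoming delta
    // through the cursor, write the result into the member buffer, then
    // repoint the cursor at it.
    void backward(std::vector<float> **deltas) {
        deltaL.assign((*deltas)->begin(), (*deltas)->end());
        for (float &v : deltaL) v *= 0.5f;   // placeholder for the real math
        *deltas = &deltaL;                   // same move as "*deltas = &deltaL;"
    }
};

int main() {
    std::vector<float> outputDeltas{1.0f, 2.0f, 3.0f};
    std::vector<float> *cursor = &outputDeltas;

    Layer out, hidden;
    out.backward(&cursor);     // cursor now points at out.deltaL
    hidden.backward(&cursor);  // hidden reads out.deltaL, cursor -> hidden.deltaL

    for (float v : *cursor) std::cout << v << ' ';   // 0.25 0.5 0.75
    std::cout << '\n';
}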
diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp
index 46f163c..89621c1 100644
--- a/src/networks/fnn/FNNetwork.cpp
+++ b/src/networks/fnn/FNNetwork.cpp
@@ -52,7 +52,7 @@ void FNNetwork::sendNewVNeurons(const size_t i) const {
 void FNNetwork::forward(const global::Tensor &newInput) {
     input = newInput;
 
-    layers[0]->forward(input);
+    layers[0]->forward(newInput);
     sendNewVNeurons(0);
 
     for (size_t i = 1; i < layers.size(); ++i) {
@@ -72,12 +72,10 @@ void FNNetwork::vUpdate() {
     visual->attempPause();
 }
 
-void FNNetwork::backward(const global::Tensor &outputDeltas) {
-    global::Tensor deltas = outputDeltas;
-
+void FNNetwork::backward(global::Tensor **outputDeltas) {
     resetGradient();
 
-    layers.back()->backward(deltas, layers[layers.size() - 2]->getOut());
+    layers.back()->backward(outputDeltas, layers[layers.size() - 2]->getOut());
 
     if (visual) {
         visual->setGrad(layers.size() - 1, layers[layers.size() - 1]->getGrad());
@@ -85,7 +83,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) {
 
     for (int i = static_cast<int>(layers.size()) - 2; i >= 0; --i) {
         const global::Tensor &prev = (i == 0) ? input : layers[i - 1]->getOut();
-        layers[i]->backward(deltas, prev, &layers[i + 1]->getParms());
+        layers[i]->backward(outputDeltas, prev, &layers[i + 1]->getParms());
 
         if (visual) {
             visual->setGrad(i, layers[i]->getGrad());
@@ -94,7 +92,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) {
         vUpdate();
     }
 
-    calculateInputDelta(deltas);
+    calculateInputDelta(outputDeltas);
 }
 
 global::ValueType FNNetwork::getLoss(const global::Prediction &pre) const {
@@ -137,14 +135,8 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) {
     }
 }
 
-void FNNetwork::calculateInputDelta(const global::Tensor &deltas) {
-    input.fill(0);
-
-    for (size_t i = 0; i < inputSize(); ++i) {
-        for (size_t j = 0; j < layers[0]->size(); ++j) {
-            input[i] += deltas[j] * layers[0]->getParms().weights({j, i});
-        }
-    }
+void FNNetwork::calculateInputDelta(global::Tensor **deltas) {
+    layers[0]->getParms().weights.matmulT(**deltas, input);
 }
 
 size_t FNNetwork::getParamCount() const {
@@ -165,10 +157,8 @@ global::Tensor FNNetwork::getParams() const {
 
     for (size_t i = 0; i < layers.size(); ++i) {
         global::Tensor params = layers[i]->getData();
-        for (size_t j = 0; j < params.numElements(); ++j) {
-            matrix[matrixI] = params[j];
-            ++matrixI;
-        }
+        matrix.insertRange(params, 0, matrixI, params.numElements());
+        matrixI += params.numElements();
     }
 
     return matrix;
@@ -177,14 +167,8 @@ void FNNetwork::setParams(const global::Tensor params) {
     size_t j = 0;
     for (size_t i = 0; i < layers.size(); ++i) {
-        global::Tensor newParam({layers[i]->getParamCount()});
-
-        for (size_t k = 0; k < newParam.numElements(); ++k) {
-            newParam[k] = params[j];
-            ++j;
-        }
-
-        layers[i]->setData(newParam);
+        layers[i]->setData(params, j);
+        j += layers[i]->getParamCount();
 
         if (visual) {
             visual->setParam(i, layers[i]->getParms());
diff --git a/src/networks/fnn/FNNetwork.hpp b/src/networks/fnn/FNNetwork.hpp
index 93d2366..674735c 100644
--- a/src/networks/fnn/FNNetwork.hpp
+++ b/src/networks/fnn/FNNetwork.hpp
@@ -13,7 +13,7 @@ class FNNetwork : public INetwork {
 
     const std::shared_ptr visual;
 
-    void calculateInputDelta(const global::Tensor &deltas);
+    void calculateInputDelta(global::Tensor **deltas);
 
     void vUpdate();
 
@@ -28,7 +28,7 @@ class FNNetwork : public INetwork {
     ~FNNetwork() override = default;
 
     void forward(const global::Tensor &newInput) override;
-    void backward(const global::Tensor &outputDeltas) override;
+    void backward(global::Tensor **outputDeltas) override;
     void updateWeights(IOptimizer &optimizer) override;
 
     void resetGradient() override;
diff --git a/src/networks/fnn/FnnVisualizer.cpp b/src/networks/fnn/FnnVisualizer.cpp
index 5ae7306..1726a78 100644
--- a/src/networks/fnn/FnnVisualizer.cpp
+++ b/src/networks/fnn/FnnVisualizer.cpp
@@ -98,7 +98,7 @@ void VisualDenseLayer::drawWeights(const size_t neuron_i, sf::RenderTexture &tar
     line_[2].position = to;
 
     line_[0].color = LINE_COLOR;
-    line_[0].color.a = parameters.weights({neuron_i, neuronP}) * 50;
+    line_[0].color.a = parameters.weights.getValue({neuron_i, neuronP}) * 50;
     line_[1].color = line_[0].color;
     line_[2].color = getColorFromTextT(getTextT(neuron_i, neuronP));
     target.draw(line_);
@@ -162,7 +162,7 @@ void VisualDenseLayer::renderNeuron(const size_t index, sf::RenderTexture &targe
         drawWeights(index, target);
     }
 
-    drawNeuron(cacheNeurons[index], net[index], out[index], target);
+    drawNeuron(cacheNeurons[index], net.getValue({index}), out.getValue({index}), target);
 }
 
 void VisualDenseLayer::drawNeurons(sf::RenderTexture &target) {
@@ -198,10 +198,10 @@ float VisualDenseLayer::calculateGap(const int size, const float scale) {
 }
 
 textType VisualDenseLayer::getTextT(const size_t layer_i, const size_t layer_p) {
-    if (gradients.weights({layer_i, layer_p}) < 0)
+    if (gradients.weights.getValue({layer_i, layer_p}) < 0)
         return textType::DOWN;
 
-    if (gradients.weights({layer_i, layer_p}) > 0)
+    if (gradients.weights.getValue({layer_i, layer_p}) > 0)
         return textType::UP;
 
     return textType::NORMAL;
@@ -254,7 +254,6 @@ void VisualDenseLayer::setGrad(const model::fnn::LayerParams &newGrad) {
 }
 
 void FnnVisualier::setWidth(const std::uint32_t newWidth) {
-    visualWidth = newWidth;
 
     if (networkRender.resize({newWidth, networkRender.getSize().y})) {
     }
diff --git a/src/networks/fnn/FnnVisualizer.hpp b/src/networks/fnn/FnnVisualizer.hpp
index 067050c..a497484 100644
--- a/src/networks/fnn/FnnVisualizer.hpp
+++ b/src/networks/fnn/FnnVisualizer.hpp
@@ -32,11 +32,11 @@ static const std::array color_lookup = {
 
 class VisualDenseLayer {
 private:
-    global::Tensor net{{0}};
-    global::Tensor out{{0}};
+    global::Tensor net{{1}};
+    global::Tensor out{{1}};
 
-    model::fnn::LayerParams parameters{0, 0};
-    model::fnn::LayerParams gradients{0, 0};
+    model::fnn::LayerParams parameters{1, 1};
+    model::fnn::LayerParams gradients{1, 1};
 
     sf::Vector2f pos;
diff --git a/src/visualizer/VInterface.cpp b/src/visualizer/VInterface.cpp
index 5fa4ebd..3cfbd5c 100644
--- a/src/visualizer/VInterface.cpp
+++ b/src/visualizer/VInterface.cpp
@@ -1,30 +1,46 @@
 #include "VInterface.hpp"
+#include "state.hpp"
 
 namespace nn::visualizer {
-IntefacePanel::IntefacePanel(const std::shared_ptr vstate)
+InterfacePanel::InterfacePanel(const std::shared_ptr vstate)
     : Panel(vstate), VRender({VINTERFACE_WIDTH, VINTERFACE_HEIGHT}) {
     createVInterface();
 }
 
-void IntefacePanel::createVInterface() {
+void InterfacePanel::createVInterface() {
     VRender.clear(INTERFACE_PANEL_COLOR);
     buttons.reserve(STATES_COUNT);
 
+    constexpr std::array<SettingType, 3> skipWhenDisabled = {
+        SettingType::AutoPause,
+        SettingType::Pause,
+        SettingType::PreciseMode};
+
     for (int i = 0; i < STATES_COUNT; ++i) {
-        buttons.push_back(std::make_unique