diff --git a/.github/workflows/cmake-multi-platform.yml b/.github/workflows/cmake-multi-platform.yml index 4e76790..5923ace 100644 --- a/.github/workflows/cmake-multi-platform.yml +++ b/.github/workflows/cmake-multi-platform.yml @@ -28,6 +28,13 @@ jobs: libxi-dev \ libfreetype6-dev + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-ubuntu2404.pin + sudo mv cuda-ubuntu2404.pin /etc/apt/preferences.d/cuda-repository-pin-600 + wget https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo dpkg -i cuda-repo-ubuntu2404-13-0-local_13.0.0-580.65.06-1_amd64.deb + sudo cp /var/cuda-repo-ubuntu2404-13-0-local/cuda-*-keyring.gpg /usr/share/keyrings/ + sudo apt-get -y install cuda-toolkit-13-0 + - name: Checkout code uses: actions/checkout@v4 @@ -36,3 +43,4 @@ jobs: - name: Build run: cmake --build build --config Release + diff --git a/CMakeLists.txt b/CMakeLists.txt index 86f4cc4..c29b4f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,18 @@ cmake_minimum_required(VERSION 3.28) -project(NeuralNetwork LANGUAGES CXX) +set(CMAKE_CUDA_ARCHITECTURES 86) # For RTX 3060 +project(NeuralNetwork LANGUAGES CXX CUDA) # Add CUDA here # ------------------------------------------------------------------ # Configuration set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 17) # Add CUDA standard +set(CMAKE_CUDA_STANDARD_REQUIRED ON) # Enforce it set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +enable_language(CUDA) + # Default to Debug build type if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE) @@ -39,22 +44,23 @@ FetchContent_Declare(nlohmann_json FetchContent_MakeAvailable(SFML nlohmann_json) -# ------------------------------------------------------------------ -# Function: Apply sanitizers -function(apply_sanitizers target) - target_compile_options(${target} PRIVATE -fsanitize=address -fno-omit-frame-pointer -g) - target_link_libraries(${target} PRIVATE -fsanitize=address) -endfunction() - # ------------------------------------------------------------------ # Main library + +# Add both C++ and CUDA sources file(GLOB_RECURSE NN_SOURCES CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cu" # Include CUDA source files ) add_library(NeuralNetwork STATIC ${NN_SOURCES}) set_target_properties(NeuralNetwork PROPERTIES POSITION_INDEPENDENT_CODE ON) +# Enable separable compilation for CUDA files +set_target_properties(NeuralNetwork PROPERTIES + CUDA_SEPARABLE_COMPILATION ON +) + target_include_directories(NeuralNetwork PUBLIC $ @@ -71,21 +77,19 @@ target_link_libraries(NeuralNetwork SFML::Window SFML::System nlohmann_json::nlohmann_json + cuda ) target_compile_options(NeuralNetwork PRIVATE -Wall -Wextra -Wpedantic) # ------------------------------------------------------------------ -# Tests (with sanitizers) +# Tests option(BUILD_NN_TESTS "Build NeuralNetwork tests" OFF) if(BUILD_NN_TESTS) enable_testing() include(CTest) - # Apply sanitizers only for test builds - apply_sanitizers(NeuralNetwork) - file(GLOB TEST_SOURCES CONFIGURE_DEPENDS tests/*.cpp) if(TEST_SOURCES) @@ -96,8 +100,6 @@ if(BUILD_NN_TESTS) target_link_libraries(${test_name} PRIVATE NeuralNetwork) target_include_directories(${test_name} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") - apply_sanitizers(${test_name}) - add_test(NAME ${test_name} COMMAND 
${test_name}) endforeach() else() @@ -109,3 +111,4 @@ endif() # Install install(TARGETS NeuralNetwork ARCHIVE DESTINATION lib) install(DIRECTORY include/ DESTINATION include) + diff --git a/include/model.hpp b/include/model.hpp index b9c9ae0..26aed62 100644 --- a/include/model.hpp +++ b/include/model.hpp @@ -4,6 +4,9 @@ #include "../src/model/dataBase.hpp" #include "../src/model/optimizers.hpp" #include "../src/visualizer/VisualizerController.hpp" +#include "Globals.hpp" +#include "tensor.hpp" +#include #include namespace nn::visualizer { @@ -65,7 +68,7 @@ class Model { global::ValueType runBackPropagation( const Batch &batch, const bool updateWeights, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void printTrainingResult( const std::chrono::high_resolution_clock::time_point &start, @@ -81,12 +84,12 @@ class Model { DataBase &dataBase, const bool cancleOnError = false, const bool showProgressbar = true, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); void trainModel( DataBase &trainedDataBase, DataBase &evaluateDataBase, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); size_t outputSize() const; size_t inputSize() const; @@ -103,10 +106,13 @@ class Model { void autoSave(const int i); - void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); - void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addFNN(const std::uint32_t width, ISubNetworkConfig &_config); + void addCNN(const std::uint32_t width, ISubNetworkConfig &_config); - std::uint32_t calculateSubNetWidth() const; + std::uint32_t calculateSubNetWidth() const; + + void runModel(const global::Tensor &input, + global::Transformation transformation); public: Model(const std::string &config_filepath); @@ -115,19 +121,19 @@ class Model { void runModel(const global::Tensor &input); void train( const std::string &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); void train( const std::vector &db_filename, - global::Transformation transformationB = dt, - global::Transformation transformationE = dt); + global::Transformation transformationB = nullptr, + global::Transformation transformationE = nullptr); modelResult evaluateModel( const std::string &db_filename, const bool cancleOnError = false, - global::Transformation transformation = dt); + global::Transformation transformation = nullptr); - void save(const std::string &file); - void load(const std::string &file); + void save(const std::string &file, bool print = true); + void load(const std::string &file, bool print = true); global::Prediction getPrediction() const; }; diff --git a/include/network/INetwork.hpp b/include/network/INetwork.hpp index 530a444..574c600 100644 --- a/include/network/INetwork.hpp +++ b/include/network/INetwork.hpp @@ -11,7 +11,7 @@ class INetwork { virtual ~INetwork() = default; virtual void forward(const global::Tensor &input) = 0; - virtual void backward(const global::Tensor &outputDeltas) = 0; + virtual void backward(global::Tensor **outputDeltas) = 0; virtual void updateWeights(IOptimizer &optimizer) = 0; virtual void resetGradient() = 0; diff --git a/include/network/IvisualNetwork.hpp 
b/include/network/IvisualNetwork.hpp index ae08c3a..58b49af 100644 --- a/include/network/IvisualNetwork.hpp +++ b/include/network/IvisualNetwork.hpp @@ -3,8 +3,7 @@ #include "../../src/visualizer/panel.hpp" #include -#include -#include +#include namespace nn::visualizer { constexpr std::uint32_t MODEL_HEIGHT = 770u; diff --git a/include/tensor.hpp b/include/tensor.hpp index 71b24d6..3f32e39 100644 --- a/include/tensor.hpp +++ b/include/tensor.hpp @@ -1,59 +1,55 @@ #ifndef TENSOR #define TENSOR -#include +#include "../src/model/tensor_gpu.hpp" #include +namespace nn::model { +class Activation; +void enableGpuMode(); +} // namespace nn::model + namespace nn::global { -using ValueType = float; class Tensor { private: - std::vector data; + std::vector cpu_data; std::vector shape; std::vector strides; + ValueType *gpu_data = nullptr; + std::size_t gpu_data_size; + + static bool isGpu; + static size_t tensorCount; + void computeStrides(); inline size_t flattenIndex(const std::vector &indices) const; + friend model::Activation; + public: // Constructors - Tensor(const std::vector &shape, float init = 0.0f); - Tensor(const Tensor &other) - : data(other.data), - shape(other.shape), - strides(other.strides) {} + Tensor(const std::vector &shape, ValueType init = 0.0f); + Tensor(const Tensor &other); - Tensor &operator=(const Tensor &other); + ~Tensor(); - // Element access - ValueType &operator()(const std::vector &indices); - ValueType operator()(const std::vector &indices) const; - inline ValueType &operator[](size_t i) { return data[i]; } - inline const ValueType &operator[](size_t i) const { return data[i]; } + Tensor &operator=(const Tensor &other); + Tensor &operator=(const std::vector &other); - // Iterators (for range-based loops) - auto begin() noexcept { return data.begin(); } - auto end() noexcept { return data.end(); } - auto begin() const noexcept { return data.begin(); } - auto end() const noexcept { return data.end(); } + ValueType getValue(const std::vector &newShape) const; + void setValue(const std::vector &newShape, const ValueType value); + void insertRange(const Tensor &other, const size_t startO, + const size_t startT, const size_t length); // Shape and size - inline const std::vector &getShape() const { return shape; } - inline size_t numElements() const { return data.size(); } - inline const std::vector &getData() const { return data; } - inline void fill(const ValueType &value) { std::fill(begin(), end(), value); } - - // Arithmetic operations - Tensor operator+(const Tensor &other) const; - Tensor operator*(const Tensor &other) const; - Tensor operator-(const Tensor &other) const; - Tensor operator/(const Tensor &other) const; - - Tensor operator*(ValueType scalar) const; - Tensor operator+(ValueType scalar) const; - Tensor operator/(ValueType scalar) const; - Tensor operator-(ValueType scalar) const; + size_t numElements() const; + const std::vector &getShape() const { return shape; } + const std::vector &getStrides() const { return strides; } + void getData(std::vector &dest) const; + void fill(const ValueType &value); + void zero(); Tensor &operator+=(const Tensor &other); Tensor &operator-=(const Tensor &other); @@ -65,10 +61,14 @@ class Tensor { Tensor &operator+=(ValueType scalar); Tensor &operator-=(ValueType scalar); - Tensor matmul(const Tensor &other) const; - static Tensor outer(const Tensor &a, const Tensor &b); - Tensor matmulT(const Tensor &vec) const; + void matmul(const Tensor &other, Tensor &result) const; + static void outer(const Tensor &a, const Tensor 
&b, Tensor &result); + void matmulT(const Tensor &vec, Tensor &result) const; + + static void toGpu(); + static void toCpu(); }; + } // namespace nn::global #endif // TENSOR diff --git a/src/model/activations.cpp b/src/model/activations.cpp index 19669bf..0a22ac3 100644 --- a/src/model/activations.cpp +++ b/src/model/activations.cpp @@ -1,36 +1,9 @@ #include "activations.hpp" +#include "tensor.hpp" +#include "tensor_gpu.hpp" +#include namespace nn::model { -global::ValueType Activation::activate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return relu(z); - case ActivationType::LeakyRelu: - return leakyRelu(z); - case ActivationType::Sigmoid: - return sigmoid(z); - case ActivationType::Tanh: - return tanh(z); - default: - return z; - } -} - -global::ValueType Activation::derivativeActivate(const global::ValueType z) const { - switch (activationType) { - case ActivationType::Relu: - return derivativeRelu(z); - case ActivationType::LeakyRelu: - return derivativeLeakyRelu(z); - case ActivationType::Sigmoid: - return derivativeSigmoid(z); - case ActivationType::Tanh: - return derivativeTanh(z); - default: - return z; - } -} - void Activation::activate(const global::Tensor &net, global::Tensor &out) const { switch (activationType) { case ActivationType::Relu: @@ -73,14 +46,17 @@ void Activation::derivativeActivate(const global::Tensor &net, global::Tensor &o } global::ValueType Activation::maxVector(const global::Tensor &metrix) { - global::ValueType max = metrix[0]; - for (auto &value : metrix) { - if (value > max) { - max = value; + if (!metrix.isGpu) { + global::ValueType max = metrix.cpu_data[0]; + for (size_t i = 0; i < metrix.numElements(); ++i) { + if (metrix.getValue({i}) > max) { + max = metrix.getValue({i}); + } } - } - return max; + return max; + } + return 0; } global::ValueType Activation::relu(const global::ValueType z) { @@ -114,61 +90,105 @@ global::ValueType Activation::derivativeTanh(const global::ValueType z) { } void Activation::relu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = relu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = relu(net.cpu_data[i]); + } + } } void Activation::derivativeRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeRelu(net.cpu_data[i]); + } + } } void Activation::leakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = leakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = leakyRelu(net.cpu_data[i]); + } + } } void Activation::derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeLeakyRelu(net[i]); + if (net.isGpu) { + global::tensor_gpu::leaky_relu_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeLeakyRelu(net.cpu_data[i]); + } + 
} } void Activation::sigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = sigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = sigmoid(net.cpu_data[i]); + } + } } void Activation::derivativeSigmoid(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeSigmoid(net[i]); + if (net.isGpu) { + global::tensor_gpu::sigmoid_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeSigmoid(net.cpu_data[i]); + } + } } void Activation::tanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] = tanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_activation(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] = tanh(net.cpu_data[i]); + } + } } void Activation::derivativeTanh(const global::Tensor &net, global::Tensor &out) { - for (size_t i = 0; i < net.numElements(); ++i) - out[i] *= derivativeTanh(net[i]); + if (net.isGpu) { + global::tensor_gpu::tanh_derivative(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + for (size_t i = 0; i < net.numElements(); ++i) { + out.cpu_data[i] *= derivativeTanh(net.cpu_data[i]); + } + } } void Activation::softmax(const global::Tensor &net, global::Tensor &out) { - global::ValueType max = maxVector(net); - global::ValueType sum = 0.0; - - for (size_t i = 0; i < net.numElements(); ++i) { - global::ValueType x = net[i] - max; - if (x < -700.0) - x = -700.0; - if (x > 700.0) - x = 700.0; - out[i] = std::exp(x); - sum += out[i]; - } + if (net.isGpu) { + global::tensor_gpu::softmax(net.gpu_data, out.gpu_data, net.gpu_data_size); + } else { + global::ValueType max = maxVector(net); + global::ValueType sum = 0.0; + + for (size_t i = 0; i < net.numElements(); ++i) { + global::ValueType x = net.cpu_data[i] - max; + if (x < -700.0) + x = -700.0; + if (x > 700.0) + x = 700.0; + out.cpu_data[i] = std::exp(x); + sum += out.cpu_data[i]; + } - sum = maxValue(sum, 1e-10); + sum = maxValue(sum, 1e-10); - out /= sum; + out /= sum; + } } } // namespace nn::model diff --git a/src/model/activations.hpp b/src/model/activations.hpp index e49a8f0..010de2a 100644 --- a/src/model/activations.hpp +++ b/src/model/activations.hpp @@ -2,10 +2,10 @@ #define ACTIVATIONSP #include "tensor.hpp" -#include #include namespace nn::model { + constexpr global::ValueType RELU_LEAKY_ALPHA = 0.01; constexpr global::ValueType maxValue(const global::ValueType &a, const float &b) { @@ -38,16 +38,13 @@ class Activation { static global::ValueType derivativeTanh(const global::ValueType z); static void relu(const global::Tensor &net, global::Tensor &out); - static void derivativeRelu(const global::Tensor &net, - global::Tensor &out); + static void derivativeRelu(const global::Tensor &net, global::Tensor &out); - static void leakyRelu(const global::Tensor &net, - global::Tensor &out); + static void leakyRelu(const global::Tensor &net, global::Tensor &out); static void derivativeLeakyRelu(const global::Tensor &net, global::Tensor &out); - static void sigmoid(const global::Tensor &net, - global::Tensor &out); + static void sigmoid(const global::Tensor &net, global::Tensor &out); static void 
derivativeSigmoid(const global::Tensor &net, global::Tensor &out); @@ -55,8 +52,7 @@ class Activation { static void derivativeTanh(const global::Tensor &net, global::Tensor &out); - static void softmax(const global::Tensor &net, - global::Tensor &out); + static void softmax(const global::Tensor &net, global::Tensor &out); static global::ValueType maxVector(const global::Tensor &metrix); @@ -67,9 +63,6 @@ class Activation { : activationType(other.activationType) {} ~Activation() = default; - global::ValueType activate(const global::ValueType x) const; - global::ValueType derivativeActivate(const global::ValueType x) const; - void activate(const global::Tensor &net, global::Tensor &out) const; void derivativeActivate(const global::Tensor &net, diff --git a/src/model/config.hpp b/src/model/config.hpp index 2c4b4ef..5aa54b7 100644 --- a/src/model/config.hpp +++ b/src/model/config.hpp @@ -148,9 +148,10 @@ NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualMode, state, mode); struct VisualConfig { bool enableVisuals{true}; + bool enableNetwrokVisual{true}; std::vector modes; }; -NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, modes); +NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE(VisualConfig, enableVisuals, enableNetwrokVisual, modes); class Config { public: diff --git a/src/model/dataBase.cpp b/src/model/dataBase.cpp index 2c13444..2a8afce 100644 --- a/src/model/dataBase.cpp +++ b/src/model/dataBase.cpp @@ -26,7 +26,7 @@ TrainSample DataBase::readLine(const std::string &line) { for (size_t i = 0; i < samples.sInputSize; ++i) { iss >> token; - new_sample.input({i}) = std::stod(token); + new_sample.input.setValue({i}, std::stod(token)); } return new_sample; @@ -65,8 +65,9 @@ int DataBase::load(const std::string &db_filename) { } TrainSample new_sample = readLine(line); - if (new_sample.input.numElements() == 0) + if (new_sample.input.numElements() == 0) { continue; + } samples.add(new_sample); } diff --git a/src/model/dataBase.hpp b/src/model/dataBase.hpp index 0ad890a..cf17ba8 100644 --- a/src/model/dataBase.hpp +++ b/src/model/dataBase.hpp @@ -4,6 +4,7 @@ #include "config.hpp" #include #include +#include namespace nn::model { const std::string DATABASE_FILE_EXETENTION = ".nndb"; diff --git a/src/model/model.cpp b/src/model/model.cpp index eb2f22e..694dd8c 100644 --- a/src/model/model.cpp +++ b/src/model/model.cpp @@ -1,5 +1,6 @@ #include "../networks/cnn/CNNetwork.hpp" #include "../networks/fnn/FNNetwork.hpp" +#include "dataBase.hpp" #include #include #include @@ -77,6 +78,9 @@ void Model::initOptimizer() { void Model::initVisual() { visual.start(); + if (!config.visualConfig.enableNetwrokVisual) + return; + for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { visual.addVisualSubNetwork(network[i]->getVisual()); network[i]->getVisual()->setVstate(visual.Vstate); @@ -89,6 +93,7 @@ std::uint32_t Model::calculateSubNetWidth() const { void Model::initModel() { const std::uint32_t WIDTH = calculateSubNetWidth(); + size_t param_amount = 0; for (size_t i = 0; i < config.networkConfig.SubNetworksConfig.size(); ++i) { ISubNetworkConfig &_config = *config.networkConfig.SubNetworksConfig[i]; @@ -98,13 +103,20 @@ void Model::initModel() { } else if (_config.NNLable() == cnn::CNN_LABLE) { addCNN(WIDTH, _config); } + + param_amount += network[i]->getParams().numElements(); } + + std::cout << "initialize model - " + << param_amount << " parameters, " + << config.networkConfig.SubNetworksConfig.size() << " sub networks" + << std::endl; } void Model::addFNN(const std::uint32_t width, 
ISubNetworkConfig &_config) { fnn::FNNConfig &sub_ = (fnn::FNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, @@ -120,7 +132,7 @@ void Model::addFNN(const std::uint32_t width, ISubNetworkConfig &_config) { void Model::addCNN(const std::uint32_t width, ISubNetworkConfig &_config) { cnn::CNNConfig &sub_ = (cnn::CNNConfig &)(_config); - if (config.visualConfig.enableVisuals) { + if (config.visualConfig.enableVisuals && config.visualConfig.enableNetwrokVisual) { std::shared_ptr visual_ = std::make_shared( visual.Vstate, @@ -158,9 +170,10 @@ void Model::updateWeights(const int batchSize) { void Model::Backward(const global::Tensor &output) { global::Tensor deltas = output; + global::Tensor *delta = &deltas; for (int i = static_cast(network.size()) - 1; i >= 0; --i) { - network[i]->backward(deltas); + network[i]->backward(&delta); deltas = network[i]->getInput(); } } @@ -176,16 +189,16 @@ global::ValueType Model::runBackPropagation( } resetNetworkGradient(); + global::Tensor output({outputSize()}); for (size_t i = 0; i < batch.size(); ++i) { - auto current_sample_ptr = batch.samples.at(i); + TrainSample *current_sample_ptr = batch.samples.at(i); visual.updatePrediction(current_sample_ptr->pre); - runModel(transformation(current_sample_ptr->input)); - - global::Tensor output({outputSize()}); - output[current_sample_ptr->pre.index] = 1; + runModel(current_sample_ptr->input, transformation); if (doBackward) { + output.zero(); + output.setValue({current_sample_ptr->pre.index}, 1); Backward(output); updateWeights(batch.size()); } @@ -280,7 +293,7 @@ bool Model::autoEvaluating( void Model::autoSave(const int i) { if (config.trainingConfig.isAutoSave() && i % config.trainingConfig.getAutoSave().saveEvery == 0) { - save(config.trainingConfig.getAutoSave().dataFilenameAutoSave); + save(config.trainingConfig.getAutoSave().dataFilenameAutoSave, false); } } @@ -328,6 +341,15 @@ float Model::calculatePercentage(size_t currentSize, size_t totalSize) { return 100.0f * static_cast(currentSize) / static_cast(totalSize); } +void Model::runModel(const global::Tensor &input, + global::Transformation transformation) { + if (transformation) { + runModel(transformation(input)); + } else { + runModel(input); + } +} + modelResult Model::evaluateModel( DataBase &dataBase, const bool cancleOnError, @@ -347,11 +369,17 @@ modelResult Model::evaluateModel( for (int i = 0; i < result.dbSize; ++i) { TrainSample &sample = dataBase.getSample(i); - runModel(transformation(sample.input)); + runModel(sample.input, transformation); + + size_t predicted_index = 0; + float max_value = getOutput().getValue({0}); - size_t predicted_index = std::distance( - getOutput().begin(), - std::max_element(getOutput().begin(), getOutput().end())); + for (size_t j = 1; j < getOutput().numElements(); ++j) { + if (getOutput().getValue({j}) > max_value) { + max_value = getOutput().getValue({j}); + predicted_index = j; + } + } if (showProgressbar) { bar++; @@ -396,28 +424,40 @@ size_t Model::inputSize() const { return network[0]->inputSize(); } -void Model::save(const std::string &file) { +void Model::save(const std::string &file, bool print) { std::ofstream outFile(file); + if (print) { + std::cout << "Start saving" << std::endl; + } + for (size_t i = 0; i < network.size(); ++i) { global::Tensor params = network[i]->getParams(); outFile << params.numElements() << " "; for (size_t j = 0; j < 
params.numElements(); ++j) { - outFile << params[j] << " "; + outFile << params.getValue({j}) << " "; } - outFile << std::endl; } + if (print) { + std::cout << " saving complete" << std::endl; + } + outFile.close(); } -void Model::load(const std::string &file) { +void Model::load(const std::string &file, bool print) { std::ifstream inFile(file); std::string line; int networkI = 0; + + if (print) { + std::cout << "Start loading" << std::endl; + } + while (std::getline(inFile, line)) { std::istringstream iss(line); @@ -426,17 +466,19 @@ void Model::load(const std::string &file) { global::Tensor numbers({ParamSize}); float num; - for (size_t i = 0; i < ParamSize; ++i) { iss >> num; - numbers[i] = num; + numbers.setValue({i}, num); } network[networkI]->setParams(numbers); - networkI++; } + if (print) { + std::cout << " loading complete" << std::endl; + } + inFile.close(); } @@ -444,12 +486,12 @@ global::Prediction Model::getPrediction() const { size_t max = 0; for (size_t i = 1; i < outputSize(); ++i) { - if (getOutput()[i] > getOutput()[max]) { + if (getOutput().getValue({i}) > getOutput().getValue({max})) { max = i; } } - return global::Prediction(max, getOutput()[max]); + return global::Prediction(max, getOutput().getValue({max})); } void Model::setTraining() { diff --git a/src/model/optimizers.cpp b/src/model/optimizers.cpp index 9014aa2..b34a89a 100644 --- a/src/model/optimizers.cpp +++ b/src/model/optimizers.cpp @@ -1,7 +1,8 @@ #include "optimizers.hpp" namespace nn::model { -void ConstantOptimizer::step(global::Tensor &weight, const global::Tensor &grad) { - weight -= grad * (config.getLearningRate() / batchSize); +void ConstantOptimizer::step(global::Tensor &weight, global::Tensor &grad) { + grad *= config.getLearningRate() / batchSize; + weight -= grad; } } // namespace nn::model diff --git a/src/model/optimizers.hpp b/src/model/optimizers.hpp index a910edf..203ea4e 100644 --- a/src/model/optimizers.hpp +++ b/src/model/optimizers.hpp @@ -13,7 +13,7 @@ class IOptimizer { public: virtual ~IOptimizer() = default; - virtual void step(global::Tensor &weight, const global::Tensor &grad) = 0; + virtual void step(global::Tensor &weight, global::Tensor &grad) = 0; virtual void reset() = 0; void setOfset(const int batchSize_) { batchSize = batchSize_; } @@ -27,7 +27,7 @@ class ConstantOptimizer : public IOptimizer { ConstantOptimizer(const ConstantOptimizerConfig &config_) : config(config_) {} - void step(global::Tensor &weight, const global::Tensor &grad) override; + void step(global::Tensor &weight, global::Tensor &grad) override; void reset() override {} }; diff --git a/src/model/tensor.cpp b/src/model/tensor.cpp index 80fd8cb..0bc82f9 100644 --- a/src/model/tensor.cpp +++ b/src/model/tensor.cpp @@ -1,30 +1,137 @@ +#include "tensor_gpu.hpp" +#include #include #include #include namespace nn::global { -Tensor::Tensor(const std::vector &shape, float init) - : shape(shape) { - if (shape.empty()) { +bool Tensor::isGpu = false; +size_t Tensor::tensorCount = 0; + +Tensor::Tensor(const std::vector &shape_, ValueType init) { + if (shape_.empty()) { throw std::invalid_argument("Tensor shape cannot be empty."); } size_t totalSize = std::accumulate( - shape.begin(), - shape.end(), + shape_.begin(), + shape_.end(), size_t(1), std::multiplies<>()); - data.assign(totalSize, init); + + shape = shape_; + if (isGpu) { + gpu_data = (ValueType *)tensor_gpu::allocate(totalSize * sizeof(ValueType)); + gpu_data_size = totalSize; + fill(init); + } else { + cpu_data.assign(totalSize, init); + } + computeStrides(); + 
+ tensorCount++; +} + +void Tensor::toGpu() { + if (isGpu) + return; + + if (tensorCount > 0) + throw std::runtime_error("Cannot switch to GPU mode: tensors already exist in CPU mode."); + + isGpu = true; +} + +void Tensor::toCpu() { + if (!isGpu) + return; + + if (tensorCount > 0) + throw std::runtime_error("Cannot switch to CPU mode: tensors already exist in GPU mode."); + + isGpu = false; +} + +Tensor::Tensor(const Tensor &other) { + shape = other.shape; + strides = other.strides; + if (isGpu) { + gpu_data_size = other.gpu_data_size; + gpu_data = (ValueType *)tensor_gpu::allocate(gpu_data_size * sizeof(ValueType)); + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + } else { + cpu_data = other.cpu_data; + } +} + +size_t Tensor::numElements() const { + if (isGpu) { + return gpu_data_size; + } + return cpu_data.size(); +} + +void Tensor::getData(std::vector &dest) const { + if (isGpu) { + tensor_gpu::copyToHost(dest.data(), gpu_data, gpu_data_size * sizeof(ValueType)); + } else { + dest = cpu_data; + } +} + +void Tensor::fill(const ValueType &value) { + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + tensor_gpu::add_scalar(gpu_data, value, gpu_data, gpu_data_size); + } else { + for (auto &n : cpu_data) { + n = value; + } + } +} + +void Tensor::zero() { + if (isGpu) { + tensor_gpu::zero(gpu_data, gpu_data_size); + } else { + fill(0); + } } Tensor &Tensor::operator=(const Tensor &other) { if (this == &other) return *this; - data = other.data; + if (isGpu) { + if (gpu_data_size != other.gpu_data_size) { + ValueType *temp = (ValueType *)tensor_gpu::allocate(other.gpu_data_size * sizeof(ValueType)); + gpu_data_size = other.gpu_data_size; + tensor_gpu::copyDeviceToDevice(temp, other.gpu_data, gpu_data_size * sizeof(ValueType)); + tensor_gpu::deallocate(gpu_data); + gpu_data = temp; + } else { + tensor_gpu::copyDeviceToDevice(gpu_data, other.gpu_data, gpu_data_size * sizeof(ValueType)); + } + } else { + cpu_data = other.cpu_data; + } + shape = other.shape; strides = other.strides; + return *this; +} + +Tensor &Tensor::operator=(const std::vector &other) { + if (other.size() != numElements()) { + throw std::length_error("Tensor assignment size mismatch"); + } + + if (isGpu) { + tensor_gpu::copyToDevice(gpu_data, other.data(), gpu_data_size * sizeof(ValueType)); + } else { + cpu_data = other; + } return *this; } @@ -52,150 +159,125 @@ inline size_t Tensor::flattenIndex(const std::vector &indices) const { return index; } -ValueType &Tensor::operator()(const std::vector &indices) { - return data[flattenIndex(indices)]; -} - -ValueType Tensor::operator()(const std::vector &indices) const { - return data[flattenIndex(indices)]; -} - -Tensor Tensor::operator+(const Tensor &other) const { - if (shape != other.shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator+."); +ValueType Tensor::getValue(const std::vector &indices) const { + if (isGpu) { + return tensor_gpu::getValueAt(gpu_data, flattenIndex(indices)); } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] + b[i]; - return result; + + return cpu_data[flattenIndex(indices)]; } -Tensor Tensor::operator-(const Tensor &other) const { - if (shape != other.shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator-."); +void Tensor::insertRange(const Tensor &other, + const size_t startO, const size_t startT, + 
const size_t length) { + if (isGpu) { + tensor_gpu::copyDeviceToDevice(gpu_data + startT, other.gpu_data + startO, length * sizeof(ValueType)); + } else { + for (size_t i = 0; i < length; ++i) { + cpu_data[i + startT] = other.cpu_data[i + startO]; + } } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] - b[i]; - return result; } -Tensor Tensor::operator/(const Tensor &other) const { - if (shape != other.shape) { - throw std::invalid_argument("Shape mismatch in Tensor::operator/."); +void Tensor::setValue(const std::vector &indices, const ValueType value) { + if (isGpu) { + tensor_gpu::setValueAt(gpu_data, flattenIndex(indices), value); + } else { + cpu_data[flattenIndex(indices)] = value; } - Tensor result(shape); - const float *a = data.data(); - const float *b = other.data.data(); - float *r = result.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - r[i] = a[i] / b[i]; - return result; } Tensor &Tensor::operator+=(const Tensor &other) { if (shape != other.shape) - throw std::invalid_argument("Shape mismatch."); - - float *__restrict__ a = data.data(); - const float *__restrict__ b = other.data.data(); - const size_t N = data.size(); - - for (size_t i = 0; i < N; ++i) - a[i] += b[i]; - + throw std::invalid_argument("Shape mismatch in Tensor::operator+=."); + if (isGpu) { + tensor_gpu::add_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] += other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator-=(const Tensor &other) { if (shape != other.shape) - throw std::invalid_argument("Shape mismatch."); - float *a = data.data(); - const float *b = other.data.data(); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - a[i] -= b[i]; + throw std::invalid_argument("Shape mismatch in Tensor::operator-=."); + if (isGpu) { + tensor_gpu::subtraction_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] -= other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator*=(const Tensor &other) { if (shape != other.shape) throw std::invalid_argument("Shape mismatch in Tensor::operator*=."); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - data[i] *= other.data[i]; + if (isGpu) { + tensor_gpu::multiply_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] *= other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator/=(const Tensor &other) { if (shape != other.shape) throw std::invalid_argument("Shape mismatch in Tensor::operator/=."); - const size_t N = data.size(); - for (size_t i = 0; i < N; ++i) - data[i] /= other.data[i]; + if (isGpu) { + tensor_gpu::division_vec(gpu_data, other.gpu_data, gpu_data, gpu_data_size); + } else { + for (size_t i = 0; i < cpu_data.size(); ++i) + cpu_data[i] /= other.cpu_data[i]; + } return *this; } Tensor &Tensor::operator*=(ValueType scalar) { - for (auto &x : data) - x *= scalar; + if (isGpu) { + tensor_gpu::multiply_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + for (auto &x : cpu_data) + x *= scalar; + } return *this; } Tensor &Tensor::operator-=(ValueType scalar) { - for (auto &x : data) - x -= scalar; + if (isGpu) { + tensor_gpu::subtraction_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + 
for (auto &x : cpu_data) + x -= scalar; + } return *this; } Tensor &Tensor::operator+=(ValueType scalar) { - for (auto &x : data) - x += scalar; + if (isGpu) { + tensor_gpu::add_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + for (auto &x : cpu_data) + x += scalar; + } return *this; } + Tensor &Tensor::operator/=(ValueType scalar) { - for (auto &x : data) - x /= scalar; + if (isGpu) { + tensor_gpu::division_scalar(gpu_data, scalar, gpu_data, gpu_data_size); + } else { + for (auto &x : cpu_data) + x /= scalar; + } return *this; } -Tensor Tensor::operator*(ValueType scalar) const { - Tensor result(*this); - result *= scalar; - return result; -} - -Tensor Tensor::operator/(ValueType scalar) const { - Tensor result(*this); - result /= scalar; - return result; -} - -Tensor Tensor::operator-(ValueType scalar) const { - Tensor result(*this); - result -= scalar; - return result; -} - -Tensor Tensor::operator+(ValueType scalar) const { - Tensor result(*this); - result += scalar; - return result; -} - -Tensor Tensor::matmul(const Tensor &other) const { +void Tensor::matmul(const Tensor &other, Tensor &result) const { const auto &aShape = shape; const auto &bShape = other.shape; - if (aShape.size() != 2 || bShape.size() != 1) throw std::runtime_error("matmul: unsupported shapes."); @@ -204,72 +286,74 @@ Tensor Tensor::matmul(const Tensor &other) const { if (K != bShape[0]) throw std::runtime_error("matmul: shape mismatch."); - Tensor result({M}); - - const float *A = data.data(); - const float *B = other.data.data(); - float *R = result.data.data(); - - for (size_t i = 0; i < M; ++i) { - float sum = 0.0f; - size_t base = i * K; - for (size_t j = 0; j < K; ++j) { - sum += A[base + j] * B[j]; + result.zero(); + + if (isGpu) { + tensor_gpu::matmul(gpu_data, other.gpu_data, result.gpu_data, M, K); + } else { + const float *A = cpu_data.data(); + const float *B = other.cpu_data.data(); + float *R = result.cpu_data.data(); + + for (size_t i = 0; i < M; ++i) { + float sum = 0.0f; + size_t base = i * K; + for (size_t j = 0; j < K; ++j) { + sum += A[base + j] * B[j]; + } + R[i] = sum; } - R[i] = sum; } - return result; } -Tensor Tensor::outer(const Tensor &a, const Tensor &b) { - const std::vector &aShape = a.getShape(); - const std::vector &bShape = b.getShape(); - - if (aShape.size() != 1 || bShape.size() != 1) { +void Tensor::outer(const Tensor &a, const Tensor &b, Tensor &result) { + if (a.shape.size() != 1 || b.shape.size() != 1) { throw std::runtime_error("outer: both tensors must be 1D vectors"); } - size_t m = aShape[0]; - size_t n = bShape[0]; + size_t m = a.shape[0]; + size_t n = b.shape[0]; + + result.zero(); - Tensor result({m, n}); - float *r = result.data.data(); - const float *A = a.data.data(); - const float *B = b.data.data(); + if (isGpu) { + tensor_gpu::outer(a.gpu_data, b.gpu_data, result.gpu_data, m, n); + } else { + float *r = result.cpu_data.data(); + const float *A = a.cpu_data.data(); + const float *B = b.cpu_data.data(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - r[i * n + j] = A[i] * B[j]; + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + r[i * n + j] += A[i] * B[j]; + } } } - return result; } -Tensor Tensor::matmulT(const Tensor &vec) const { - const auto &wShape = shape; - const auto &vShape = vec.shape; - - if (wShape.size() != 2 || vShape.size() != 1) +void Tensor::matmulT(const Tensor &vec, Tensor &result) const { + if (shape.size() != 2 || vec.shape.size() != 1) throw std::runtime_error("matmulT: bad 
dimensions"); - - size_t M = wShape[0]; - size_t N = wShape[1]; - if (vShape[0] != M) + if (vec.shape[0] != shape[0]) throw std::runtime_error("matmulT: incompatible"); - Tensor result({N}, 0.0f); + result.zero(); - const float *W = data.data(); - const float *V = vec.data.data(); - float *R = result.data.data(); - - for (size_t i = 0; i < N; ++i) { - float sum = 0.0f; - for (size_t j = 0; j < M; ++j) { - sum += W[j * N + i] * V[j]; + if (isGpu) { + tensor_gpu::matmulT(gpu_data, vec.gpu_data, result.gpu_data, shape[0], shape[1]); + } else { + for (size_t i = 0; i < shape[1]; ++i) { + for (size_t j = 0; j < shape[0]; ++j) { + result.cpu_data[i] += cpu_data[j * shape[1] + i] * vec.cpu_data[j]; + } } - R[i] = sum; } - return result; +} + +Tensor::~Tensor() { + if (isGpu) { + tensor_gpu::deallocate(gpu_data); + } + tensorCount--; } } // namespace nn::global diff --git a/src/model/tensor_gpu.cu b/src/model/tensor_gpu.cu new file mode 100644 index 0000000..f11fef7 --- /dev/null +++ b/src/model/tensor_gpu.cu @@ -0,0 +1,437 @@ +#include +#include "tensor_gpu.hpp" +#include +#include + +namespace nn::global::tensor_gpu { +#define CUDA_CHECK(call) do { \ + cudaError_t e = (call); \ + if (e != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + throw std::runtime_error(cudaGetErrorString(e)); \ + } \ +} while(0) + +// ================================================== +// Memory Management +// ================================================== +void* allocate(std::size_t size) { + void* devicePtr = nullptr; + if (cudaMalloc(&devicePtr, size) != cudaSuccess) { + throw std::runtime_error("cudaMalloc failed"); + } + return devicePtr; +} + +void deallocate(void* devicePtr) { + if (devicePtr) { + cudaFree(devicePtr); + } +} + +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t size) { + CUDA_CHECK(cudaMemcpy(deviceDst, hostSrc, size, cudaMemcpyHostToDevice)); +} + +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t size) { + CUDA_CHECK(cudaMemcpy(deviceDst, deviceSrc, size, cudaMemcpyDeviceToDevice)); +} + +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t size) { + CUDA_CHECK(cudaMemcpy(hostDst, deviceSrc, size, cudaMemcpyDeviceToHost)); +} + +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value) { + CUDA_CHECK(cudaMemcpy(devicePtr + index, &value, sizeof(ValueType), cudaMemcpyHostToDevice)); +} + +ValueType getValueAt(const ValueType* devicePtr, std::size_t index) { + ValueType value; + CUDA_CHECK(cudaMemcpy(&value, devicePtr + index, sizeof(ValueType), cudaMemcpyDeviceToHost)); + return value; +} + +// ================================================== +// Utility Kernels +// ================================================== +__global__ void zeroKernel(ValueType* data, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) data[idx] = 0.0f; +} + +void zero(ValueType* deviceData, std::size_t count) { + std::size_t blockSize = 256; + std::size_t numBlocks = (count + blockSize - 1) / blockSize; + zeroKernel<<>>(deviceData, count); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +// ================================================== +// Vector-Vector Operations +// ================================================== +__global__ void addVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) { + std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < count) 
C[idx] = A[idx] + B[idx];
+}
+
+__global__ void subVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] - B[idx];
+}
+
+__global__ void mulVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] * B[idx];
+}
+
+__global__ void divVecKernel(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] / B[idx];
+}
+
+void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    addVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    subVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    mulVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    divVecKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Vector-Scalar Operations
+// ==================================================
+__global__ void addScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] + B;
+}
+
+__global__ void subScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] - B;
+}
+
+__global__ void mulScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] * B;
+}
+
+__global__ void divScalarKernel(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) C[idx] = A[idx] / B;
+}
+
+void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    addScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    subScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    mulScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    divScalarKernel<<<numBlocks, blockSize>>>(A, B, C, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Activation Functions
+// ==================================================
+__global__ void reluKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) output[idx] = input[idx] > 0.0f ? input[idx] : 0.0f;
+}
+
+__global__ void reluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType derivative = (input[idx] > 0.0f) ? 1.0f : 0.0f;
+        output[idx] *= derivative; // FIX: Changed = to *=
+    }
+}
+
+void relu(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    reluKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void relu_derivative(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    reluDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+__global__ void sigmoidKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType x = input[idx];
+        output[idx] = 1.0f / (1.0f + expf(-x));
+    }
+}
+
+__global__ void sigmoidDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType x = input[idx];
+        ValueType s = 1.0f / (1.0f + expf(-x));
+        ValueType derivative = s * (1.0f - s);
+        output[idx] *= derivative;
+    }
+}
+
+void sigmoid(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    sigmoidKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    sigmoidDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+__global__ void tanhKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) output[idx] = tanhf(input[idx]);
+}
+
+__global__ void tanhDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType t = tanhf(input[idx]);
+        ValueType derivative = 1.0f - t * t;
+        output[idx] *= derivative;
+    }
+}
+
+void tanh_activation(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    tanhKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    tanhDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+__global__ void leakyReluKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) output[idx] = (input[idx] > 0.0f) ? input[idx] : alpha * input[idx];
+}
+
+__global__ void leakyReluDerivativeKernel(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < count) {
+        ValueType derivative = (input[idx] > 0.0f) ? 1.0f : alpha;
+        output[idx] *= derivative; // FIX: Changed = to *=
+    }
+}
+
+void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    leakyReluKernel<<<numBlocks, blockSize>>>(input, output, count, alpha);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    leakyReluDerivativeKernel<<<numBlocks, blockSize>>>(input, output, count, alpha);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Softmax
+// ==================================================
+__global__ void softmaxKernel(const ValueType* input, ValueType* output, std::size_t count) {
+    extern __shared__ ValueType shared[];
+    std::size_t tid = threadIdx.x;
+    std::size_t blockStart = blockIdx.x * blockDim.x;
+    std::size_t idx = blockStart + tid;
+
+    // always write shared for every thread in block
+    shared[tid] = (idx < count) ? input[idx] : -INFINITY;
+    __syncthreads();
+
+    // compute max (naive per-thread loop)
+    ValueType max_val = shared[0];
+    for (unsigned int i = 1; i < blockDim.x; ++i) {
+        std::size_t curIdx = blockStart + i;
+        if (curIdx < count) max_val = fmaxf(max_val, shared[i]);
+    }
+    __syncthreads();
+
+    ValueType e = (idx < count) ? expf(shared[tid] - max_val) : 0.0f;
+    shared[tid] = e;
+    __syncthreads();
+
+    // compute sum (naive)
+    ValueType sum = 0.0f;
+    for (unsigned int i = 0; i < blockDim.x; ++i) {
+        std::size_t curIdx = blockStart + i;
+        if (curIdx < count) sum += shared[i];
+    }
+    __syncthreads();
+
+    if (idx < count) output[idx] = shared[tid] / (sum == 0.0f ? 1.0f : sum);
+}
+
+void softmax(const ValueType* input, ValueType* output, std::size_t count) {
+    std::size_t blockSize = 256;
+    std::size_t numBlocks = (count + blockSize - 1) / blockSize;
+    std::size_t sharedMemSize = blockSize * sizeof(ValueType);
+    // Note: softmaxKernel normalizes within a single block, so the result is a true
+    // softmax only when count <= blockSize; larger tensors are normalized per block.
+    softmaxKernel<<<numBlocks, blockSize, sharedMemSize>>>(input, output, count);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+// ==================================================
+// Index Utilities
+// ==================================================
+__global__ void flattenIndexKernel(const size_t* indices, const size_t* shape,
+                                   const size_t* strides, size_t ndim, size_t* outIndex) {
+    size_t idx = 0;
+    for (size_t i = 0; i < ndim; ++i) {
+        if (indices[i] >= shape[i]) {
+            *outIndex = size_t(-1);
+            return;
+        }
+        idx += indices[i] * strides[i];
+    }
+    *outIndex = idx;
+}
+
+__global__ void computeFlatIndexKernel(const size_t* indices, const size_t* strides,
+                                       size_t rank, size_t* outIndex) {
+    size_t flatIndex = 0;
+    for (size_t i = 0; i < rank; ++i) {
+        flatIndex += indices[i] * strides[i];
+    }
+    *outIndex = flatIndex;
+}
+
+// ==================================================
+// Matrix Operations
+// ==================================================
+__global__ void matmulKernel(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) {
+    size_t row = blockIdx.x * blockDim.x + threadIdx.x;
+    if (row < M) {
+        ValueType sum = 0;
+        for (size_t j = 0; j < K; ++j) {
+            sum += A[row * K + j] * B[j];
+        }
+        R[row] = sum;
+    }
+}
+
+__global__ void outerKernel(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total = m * n;
+    if (idx < total) {
+        size_t i = idx / n;
+        size_t j = idx % n;
+        result[i * n + j] += a[i] * b[j];
+    }
+}
+
+__global__ void matmulTKernel(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) {
+    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+    if (col < N) {
+        ValueType sum = 0.0f;
+        for (size_t i = 0; i < M; ++i) {
+            sum += W[i * N + col] * V[i];
+        }
+        R[col] = sum;
+    }
+}
+
+void matmul(const ValueType* A, const ValueType* B, ValueType* R, size_t M, size_t K) {
+    const int blockSize = 256;
+    int gridSize = (M + blockSize - 1) / blockSize;
+    matmulKernel<<<gridSize, blockSize>>>(A, B, R, M, K);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void outer(const ValueType* a, const ValueType* b, ValueType* result, size_t m, size_t n) {
+    const int blockSize = 256;
+    int gridSize = (m * n + blockSize - 1) / blockSize;
+    outerKernel<<<gridSize, blockSize>>>(a, b, result, m, n);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+void matmulT(const ValueType* W, const ValueType* V, ValueType* R, size_t M, size_t N) {
+    const int blockSize = 256;
+    int gridSize = (N + blockSize - 1) / blockSize;
+    matmulTKernel<<<gridSize, blockSize>>>(W, V, R, M, N);
+    cudaDeviceSynchronize();
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+} // namespace nn::global::tensor_gpu
diff --git a/src/model/tensor_gpu.hpp b/src/model/tensor_gpu.hpp
new file mode 100644
index 0000000..ad2ba02
--- /dev/null
+++ b/src/model/tensor_gpu.hpp
@@ -0,0 +1,81 @@
+#ifndef TENSOR_GPU
+#define TENSOR_GPU
+
+#include <cstddef>
+
+namespace nn::global {
+using ValueType = float;
+}
+
+class Tensor; // Forward declaration
+
+namespace nn::global::tensor_gpu {
+
+// ============================
+// Memory Management
+// ============================
+void*
allocate(std::size_t size); +void deallocate(void* devicePtr); + +void copyToDevice(void* deviceDst, const void* hostSrc, std::size_t count); +void copyToHost(void* hostDst, const void* deviceSrc, std::size_t count); +void copyDeviceToDevice(void* deviceDst, const void* deviceSrc, std::size_t count); + +void zero(ValueType* deviceData, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Vector) +// ============================ +void add_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void subtraction_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void division_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); +void multiply_vec(const ValueType* A, const ValueType* B, ValueType* C, std::size_t count); + +// ============================ +// Element-wise Operations (Vector-Scalar) +// ============================ +void add_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void subtraction_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void division_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); +void multiply_scalar(const ValueType* A, ValueType B, ValueType* C, std::size_t count); + +// ============================ +// Activation Functions +// ============================ + +// ReLU +void relu(const ValueType* input, ValueType* output, std::size_t count); +void relu_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Leaky ReLU +void leaky_relu(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); +void leaky_relu_derivative(const ValueType* input, ValueType* output, std::size_t count, ValueType alpha = 0.01f); + +// Sigmoid +void sigmoid(const ValueType* input, ValueType* output, std::size_t count); +void sigmoid_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Tanh +void tanh_activation(const ValueType* input, ValueType* output, std::size_t count); +void tanh_derivative(const ValueType* input, ValueType* output, std::size_t count); + +// Softmax +void softmax(const ValueType* net, ValueType* out, std::size_t size); + +// ============================ +// Single Value Access +// ============================ +ValueType getValueAt(const ValueType* devicePtr, std::size_t index); +void setValueAt(ValueType* devicePtr, std::size_t index, ValueType value); + +// ============================ +// Matrix Operations +// ============================ +void matmul(const ValueType* A, const ValueType* B, ValueType* R, std::size_t M, std::size_t K); +void matmulT(const ValueType* W, const ValueType* V, ValueType* R, std::size_t M, std::size_t N); +void outer(const ValueType* a, const ValueType* b, ValueType* result, std::size_t m, std::size_t n); + +} // namespace nn::global::tensor_gpu + +#endif // TENSOR_GPU + diff --git a/src/networks/cnn/CNNetwork.cpp b/src/networks/cnn/CNNetwork.cpp index fc45ba3..a2d8b1c 100644 --- a/src/networks/cnn/CNNetwork.cpp +++ b/src/networks/cnn/CNNetwork.cpp @@ -16,7 +16,7 @@ void CNNetwork::forward(const global::Tensor &newInput) { input = newInput; } -void CNNetwork::backward(const global::Tensor &) { +void CNNetwork::backward(global::Tensor **) { } global::ValueType CNNetwork::getLoss(const global::Prediction &) const { diff --git a/src/networks/cnn/CNNetwork.hpp b/src/networks/cnn/CNNetwork.hpp index e2ad58d..1c08705 100644 --- a/src/networks/cnn/CNNetwork.hpp +++ 
diff --git a/src/networks/cnn/CNNetwork.cpp b/src/networks/cnn/CNNetwork.cpp
index fc45ba3..a2d8b1c 100644
--- a/src/networks/cnn/CNNetwork.cpp
+++ b/src/networks/cnn/CNNetwork.cpp
@@ -16,7 +16,7 @@ void CNNetwork::forward(const global::Tensor &newInput) {
     input = newInput;
 }
 
-void CNNetwork::backward(const global::Tensor &) {
+void CNNetwork::backward(global::Tensor **) {
 }
 
 global::ValueType CNNetwork::getLoss(const global::Prediction &) const {
diff --git a/src/networks/cnn/CNNetwork.hpp b/src/networks/cnn/CNNetwork.hpp
index e2ad58d..1c08705 100644
--- a/src/networks/cnn/CNNetwork.hpp
+++ b/src/networks/cnn/CNNetwork.hpp
@@ -24,7 +24,7 @@ class CNNetwork : public INetwork {
     ~CNNetwork() override = default;
 
     void forward(const global::Tensor &newInput) override;
-    void backward(const global::Tensor &outputDeltas) override;
+    void backward(global::Tensor **outputDeltas) override;
     void updateWeights(IOptimizer &optimizer) override;
 
     void resetGradient() override;
diff --git a/src/networks/fnn/DenseLayer.cpp b/src/networks/fnn/DenseLayer.cpp
index dc8f7ea..b467dc2 100644
--- a/src/networks/fnn/DenseLayer.cpp
+++ b/src/networks/fnn/DenseLayer.cpp
@@ -1,5 +1,7 @@
 #include "DenseLayer.hpp"
+#include
 #include
+#include
 
 namespace nn::model::fnn {
 DenseLayer::DenseLayer(
@@ -11,6 +13,7 @@ DenseLayer::DenseLayer(
       out({size}),
       parameters(size, prevSize),
       gradients(size, prevSize),
+      deltaL({size}),
       activationFunction(activation) {
     if (randomInit) {
         fillParamRandom();
@@ -27,43 +30,46 @@ void Hidden_Layer::CreateDropoutMask() {
     static thread_local std::mt19937 rng{std::random_device{}()};
     std::bernoulli_distribution bernoulli(keepProb);
 
+    static std::vector<ValueType> temp(dropoutMask.numElements(), 0);
     for (size_t i = 0; i < dropoutMask.numElements(); ++i) {
-        dropoutMask[i] = static_cast<ValueType>(bernoulli(rng));
+        temp[i] = static_cast<ValueType>(bernoulli(rng));
     }
+
+    dropoutMask = temp;
 }
 
 void Output_Layer::forward(const global::Tensor &metrix) {
-    net = parameters.weights.matmul(metrix);
+    parameters.weights.matmul(metrix, net);
     net += parameters.biases;
 
     activationFunction.activate(net, out);
 }
 
-global::Tensor Output_Layer::getDelta(const global::Tensor &output) {
-    global::Tensor deltas = out;
-    deltas -= output;
-
-    return deltas;
+void Output_Layer::getDelta(const global::Tensor &output) {
+    deltaL = out;
+    deltaL -= output;
 }
 
 void Output_Layer::backward(
-    global::Tensor &deltas,
+    global::Tensor **deltas,
     const global::Tensor &prevLayer,
     const LayerParams *) {
     if (activationFunction.getType() == ActivationType::Softmax) {
-        deltas = getDelta(deltas);
+        getDelta(**deltas);
     } else {
-        activationFunction.derivativeActivate(out, deltas);
+        activationFunction.derivativeActivate(out, **deltas);
+        deltaL = **deltas;
     }
 
-    gradients.biases += deltas;
-    gradients.weights += global::Tensor::outer(deltas, prevLayer);
+    gradients.biases += deltaL;
+    global::Tensor::outer(deltaL, prevLayer, gradients.weights);
+    *deltas = &deltaL;
 }
 
 global::ValueType Output_Layer::getCrossEntropyLoss(
     const global::Tensor &prediction, const size_t target) {
-    return -std::log(std::max(prediction[target], MIN_LOSS_VALUE));
+    return -std::log(std::max(prediction.getValue({target}), MIN_LOSS_VALUE));
 }
 
 global::ValueType Output_Layer::getLoss(const global::Prediction &targets) {
@@ -73,8 +79,8 @@ global::ValueType Output_Layer::getLoss(const global::Prediction &targets) {
 
 void Hidden_Layer::forward(const global::Tensor &metrix) {
     if (isTraining) CreateDropoutMask();
-
-    net = parameters.weights.matmul(metrix);
+
+    parameters.weights.matmul(metrix, net);
     net += parameters.biases;
 
     if (isTraining && config.dropoutRate > 0.0f) {
@@ -86,32 +92,31 @@ void Hidden_Layer::forward(const global::Tensor &metrix) {
 }
 
 void Hidden_Layer::backward(
-    global::Tensor &deltas,
+    global::Tensor **deltas,
     const global::Tensor &prevLayer,
     const LayerParams *nextLayer) {
     if (!nextLayer) return;
 
-    deltas = getDelta(deltas, *nextLayer);
+    calculateDelta(**deltas, *nextLayer);
 
     if (isTraining && config.dropoutRate) {
-        deltas *= dropoutMask;
+        deltaL *= dropoutMask;
    }
 
-    gradients.biases += deltas;
+    gradients.biases += deltaL;
 
-    gradients.weights += global::Tensor::outer(deltas, prevLayer);
+    global::Tensor::outer(deltaL, prevLayer, gradients.weights);
+    *deltas = &deltaL;
 }
 
-global::Tensor Hidden_Layer::getDelta(
+void Hidden_Layer::calculateDelta(
     const global::Tensor &output,
     const LayerParams &nextLayer) {
-    auto deltas = nextLayer.weights.matmulT(output);
-    activationFunction.derivativeActivate(out, deltas);
-
-    return deltas;
+    nextLayer.weights.matmulT(output, deltaL);
+    activationFunction.derivativeActivate(out, deltaL);
 }
 
 size_t DenseLayer::getParamCount() const {
@@ -124,41 +129,29 @@ void DenseLayer::updateWeight(nn::model::IOptimizer &optimizer) {
 }
 
 const global::Tensor DenseLayer::getData() const {
-    global::Tensor matrix({parameters.paramSize()});
-
-    size_t currentI = 0;
-    for (size_t i = 0; i < size(); ++i) {
-        for (size_t j = 0; j < prevSize(); ++j) {
-            matrix[currentI] = parameters.weights({i, j});
+    size_t weightsSize = parameters.weights.numElements();
+    size_t biasesSize = parameters.biases.numElements();
 
-            ++currentI;
-        }
-    }
+    global::Tensor matrix({weightsSize + biasesSize});
 
-    for (size_t i = 0; i < size(); ++i) {
-        matrix[currentI] = parameters.biases[i];
+    // Copy weights
+    matrix.insertRange(parameters.weights, 0, 0, weightsSize);
 
-        ++currentI;
-    }
+    // Copy biases
+    matrix.insertRange(parameters.biases, 0, weightsSize, biasesSize);
 
     return matrix;
 }
 
-void DenseLayer::setData(const global::Tensor newParam) {
-    size_t currentI = 0;
-    for (size_t i = 0; i < size(); ++i) {
-        for (size_t j = 0; j < prevSize(); ++j) {
-            parameters.weights({i, j}) = newParam[currentI];
+void DenseLayer::setData(const global::Tensor newParam, const size_t offset) {
+    size_t weightsSize = parameters.weights.numElements();
+    size_t biasesSize = parameters.biases.numElements();
 
-            ++currentI;
-        }
-    }
+    // Copy into weights
+    parameters.weights.insertRange(newParam, offset, 0, weightsSize);
 
-    for (size_t i = 0; i < size(); ++i) {
-        parameters.biases[i] = newParam[currentI];
-
-        ++currentI;
-    }
+    // Copy into biases
+    parameters.biases.insertRange(newParam, offset + weightsSize, 0, biasesSize);
 }
 
 void DenseLayer::fillParamRandom() {
@@ -167,18 +160,20 @@ void DenseLayer::fillParamRandom() {
     global::ValueType std_dev = std::sqrt(2.0 / static_cast<ValueType>(prevSize()));
     std::normal_distribution<> dist(0.0, std_dev);
 
-    for (auto &value : parameters.weights) {
-        value = dist(gen);
+    std::vector<ValueType> temp(parameters.weights.numElements());
+    for (size_t i = 0; i < temp.size(); ++i) {
+        temp[i] = dist(gen);
     }
+    parameters.weights = temp;
 }
 
 void DenseLayer::resetDots() {
-    net.fill(0);
-    out.fill(0);
+    net.zero();
+    out.zero();
 }
 
 void DenseLayer::resetGradient() {
-    gradients.biases.fill(0);
-    gradients.weights.fill(0);
+    gradients.biases.zero();
+    gradients.weights.zero();
 }
 } // namespace nn::model::fnn
diff --git a/src/networks/fnn/DenseLayer.hpp b/src/networks/fnn/DenseLayer.hpp
index 84587aa..651adb2 100644
--- a/src/networks/fnn/DenseLayer.hpp
+++ b/src/networks/fnn/DenseLayer.hpp
@@ -1,8 +1,8 @@
 #ifndef DENSELAYER
 #define DENSELAYER
 
-#include "../../model/config.hpp"
 #include "../src/model/optimizers.hpp"
+#include
 
 namespace nn::model::fnn {
 constexpr global::ValueType MIN_LOSS_VALUE = 1e-10;
@@ -11,11 +11,15 @@ struct LayerParams {
     global::Tensor weights;
     global::Tensor biases;
 
+    size_t size_;
+    size_t prevSize_;
+
     LayerParams(size_t out_dim, size_t in_dim)
-        : weights({out_dim, in_dim}), biases({out_dim}) {}
+        : weights({out_dim, in_dim}), biases({out_dim}),
+          size_(out_dim), prevSize_(in_dim) {}
 
-    size_t size() const { return biases.numElements(); }
-    size_t prevSize() const { return weights.getShape()[1]; }
+    size_t size() const { return size_; }
+    size_t prevSize() const { return prevSize_; }
 
     size_t paramSize() const { return biases.numElements() + weights.numElements(); }
 };
@@ -28,6 +32,8 @@ class DenseLayer {
     LayerParams parameters;
     LayerParams gradients;
 
+    global::Tensor deltaL;
+
     Activation activationFunction;
 
     bool isTraining{false};
@@ -45,7 +51,7 @@ class DenseLayer {
     virtual void forward(const global::Tensor &metrix) = 0;
     void updateWeight(IOptimizer &optimizer);
     virtual void backward(
-        global::Tensor &deltas,
+        global::Tensor **deltas,
         const global::Tensor &prevLayer,
         const LayerParams *nextLayer = nullptr) = 0;
     virtual global::ValueType getLoss(const global::Prediction &) { return 0; };
@@ -65,7 +71,7 @@ class DenseLayer {
     size_t getParamCount() const;
     const global::Tensor getData() const;
 
-    void setData(const global::Tensor newParam);
+    void setData(const global::Tensor newParam, const size_t offset);
 
     void setTraining(const bool state) { isTraining = state; }
 };
@@ -73,7 +79,7 @@ class DenseLayer {
 class Hidden_Layer : public DenseLayer {
 private:
     const DenseLayerConfig &config;
-    global::Tensor getDelta(
+    void calculateDelta(
         const global::Tensor &output,
         const LayerParams &nextLayer);
 
@@ -93,7 +99,7 @@ class Hidden_Layer : public DenseLayer {
     void forward(const global::Tensor &metrix) override;
 
     void backward(
-        global::Tensor &deltas,
+        global::Tensor **deltas,
         const global::Tensor &prevLayer,
         const LayerParams *nextLayer) override;
 };
@@ -102,7 +108,7 @@ class Output_Layer : public DenseLayer {
 private:
     const FNNConfig &config;
 
-    global::Tensor getDelta(const global::Tensor &output);
+    void getDelta(const global::Tensor &output);
 
     static global::ValueType getCrossEntropyLoss(
         const global::Tensor &prediction, const size_t target);
@@ -122,7 +128,7 @@ class Output_Layer : public DenseLayer {
     void forward(const global::Tensor &metrix) override;
 
     void backward(
-        global::Tensor &deltas,
+        global::Tensor **deltas,
         const global::Tensor &prevLayer,
         const LayerParams *) override;
 
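The backward interface now threads a global::Tensor** through the layers instead of copying delta tensors: each layer writes its result into its own deltaL member and repoints the caller's cursor at it (*deltas = &deltaL). Below is a rough, self-contained analogue of that pointer-swapping idea; the project types are replaced by std::vector so it compiles standalone, and only the calling convention is taken from this patch.

// Hypothetical sketch of the delta-chaining convention (not project code).
#include <iostream>
#include <vector>

// Stand-in for a layer: owns its own delta buffer, like DenseLayer::deltaL.
struct Layer {
    std::vector<float> deltaL;

    // Mirrors backward(global::Tensor **deltas): read the incoming delta
    // through the cursor, write the result into the member buffer, then
    // repoint the cursor at it.
    void backward(std::vector<float> **deltas) {
        deltaL.assign((*deltas)->begin(), (*deltas)->end());
        for (float &v : deltaL) v *= 0.5f;   // placeholder for the real math
        *deltas = &deltaL;                   // same move as "*deltas = &deltaL;"
    }
};

int main() {
    std::vector<float> outputDeltas{1.0f, 2.0f, 3.0f};
    std::vector<float> *cursor = &outputDeltas;

    Layer out, hidden;
    out.backward(&cursor);     // cursor now points at out.deltaL
    hidden.backward(&cursor);  // hidden reads out.deltaL, cursor -> hidden.deltaL

    for (float v : *cursor) std::cout << v << ' ';   // 0.25 0.5 0.75
    std::cout << '\n';
}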
diff --git a/src/networks/fnn/FNNetwork.cpp b/src/networks/fnn/FNNetwork.cpp
index 46f163c..89621c1 100644
--- a/src/networks/fnn/FNNetwork.cpp
+++ b/src/networks/fnn/FNNetwork.cpp
@@ -52,7 +52,7 @@ void FNNetwork::sendNewVNeurons(const size_t i) const {
 void FNNetwork::forward(const global::Tensor &newInput) {
     input = newInput;
 
-    layers[0]->forward(input);
+    layers[0]->forward(newInput);
     sendNewVNeurons(0);
 
     for (size_t i = 1; i < layers.size(); ++i) {
@@ -72,12 +72,10 @@ void FNNetwork::vUpdate() {
     visual->attempPause();
 }
 
-void FNNetwork::backward(const global::Tensor &outputDeltas) {
-    global::Tensor deltas = outputDeltas;
-
+void FNNetwork::backward(global::Tensor **outputDeltas) {
     resetGradient();
 
-    layers.back()->backward(deltas, layers[layers.size() - 2]->getOut());
+    layers.back()->backward(outputDeltas, layers[layers.size() - 2]->getOut());
 
     if (visual) {
         visual->setGrad(layers.size() - 1, layers[layers.size() - 1]->getGrad());
@@ -85,7 +83,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) {
 
     for (int i = static_cast<int>(layers.size()) - 2; i >= 0; --i) {
         const global::Tensor &prev = (i == 0) ? input : layers[i - 1]->getOut();
-        layers[i]->backward(deltas, prev, &layers[i + 1]->getParms());
+        layers[i]->backward(outputDeltas, prev, &layers[i + 1]->getParms());
 
         if (visual) {
             visual->setGrad(i, layers[i]->getGrad());
@@ -94,7 +92,7 @@ void FNNetwork::backward(const global::Tensor &outputDeltas) {
         vUpdate();
     }
 
-    calculateInputDelta(deltas);
+    calculateInputDelta(outputDeltas);
 }
 
 global::ValueType FNNetwork::getLoss(const global::Prediction &pre) const {
@@ -137,14 +135,8 @@ void FNNetwork::updateWeights(IOptimizer &optimizer) {
     }
 }
 
-void FNNetwork::calculateInputDelta(const global::Tensor &deltas) {
-    input.fill(0);
-
-    for (size_t i = 0; i < inputSize(); ++i) {
-        for (size_t j = 0; j < layers[0]->size(); ++j) {
-            input[i] += deltas[j] * layers[0]->getParms().weights({j, i});
-        }
-    }
+void FNNetwork::calculateInputDelta(global::Tensor **deltas) {
+    layers[0]->getParms().weights.matmulT(**deltas, input);
 }
 
 size_t FNNetwork::getParamCount() const {
@@ -165,10 +157,8 @@ global::Tensor FNNetwork::getParams() const {
 
     for (size_t i = 0; i < layers.size(); ++i) {
         global::Tensor params = layers[i]->getData();
-        for (size_t j = 0; j < params.numElements(); ++j) {
-            matrix[matrixI] = params[j];
-            ++matrixI;
-        }
+        matrix.insertRange(params, 0, matrixI, params.numElements());
+        matrixI += params.numElements();
     }
 
     return matrix;
@@ -177,14 +167,8 @@ void FNNetwork::setParams(const global::Tensor params) {
     size_t j = 0;
     for (size_t i = 0; i < layers.size(); ++i) {
-        global::Tensor newParam({layers[i]->getParamCount()});
-
-        for (size_t k = 0; k < newParam.numElements(); ++k) {
-            newParam[k] = params[j];
-            ++j;
-        }
-
-        layers[i]->setData(newParam);
+        layers[i]->setData(params, j);
+        j += layers[i]->getParamCount();
 
         if (visual) {
             visual->setParam(i, layers[i]->getParms());
diff --git a/src/networks/fnn/FNNetwork.hpp b/src/networks/fnn/FNNetwork.hpp
index 93d2366..674735c 100644
--- a/src/networks/fnn/FNNetwork.hpp
+++ b/src/networks/fnn/FNNetwork.hpp
@@ -13,7 +13,7 @@ class FNNetwork : public INetwork {
 
     const std::shared_ptr visual;
 
-    void calculateInputDelta(const global::Tensor &deltas);
+    void calculateInputDelta(global::Tensor **deltas);
 
     void vUpdate();
 
@@ -28,7 +28,7 @@ class FNNetwork : public INetwork {
     ~FNNetwork() override = default;
 
     void forward(const global::Tensor &newInput) override;
-    void backward(const global::Tensor &outputDeltas) override;
+    void backward(global::Tensor **outputDeltas) override;
     void updateWeights(IOptimizer &optimizer) override;
 
     void resetGradient() override;
diff --git a/src/networks/fnn/FnnVisualizer.cpp b/src/networks/fnn/FnnVisualizer.cpp
index 5ae7306..1726a78 100644
--- a/src/networks/fnn/FnnVisualizer.cpp
+++ b/src/networks/fnn/FnnVisualizer.cpp
@@ -98,7 +98,7 @@ void VisualDenseLayer::drawWeights(const size_t neuron_i, sf::RenderTexture &tar
     line_[2].position = to;
 
     line_[0].color = LINE_COLOR;
-    line_[0].color.a = parameters.weights({neuron_i, neuronP}) * 50;
+    line_[0].color.a = parameters.weights.getValue({neuron_i, neuronP}) * 50;
     line_[1].color = line_[0].color;
     line_[2].color = getColorFromTextT(getTextT(neuron_i, neuronP));
     target.draw(line_);
@@ -162,7 +162,7 @@ void VisualDenseLayer::renderNeuron(const size_t index, sf::RenderTexture &targe
         drawWeights(index, target);
     }
 
-    drawNeuron(cacheNeurons[index], net[index], out[index], target);
+    drawNeuron(cacheNeurons[index], net.getValue({index}), out.getValue({index}), target);
 }
 
 void VisualDenseLayer::drawNeurons(sf::RenderTexture &target) {
@@ -198,10 +198,10 @@ float VisualDenseLayer::calculateGap(const int size, const float scale) {
 }
 
 textType VisualDenseLayer::getTextT(const size_t layer_i, const size_t layer_p) {
-    if (gradients.weights({layer_i, layer_p}) < 0)
+    if (gradients.weights.getValue({layer_i, layer_p}) < 0)
         return textType::DOWN;
 
-    if (gradients.weights({layer_i, layer_p}) > 0)
+    if (gradients.weights.getValue({layer_i, layer_p}) > 0)
         return textType::UP;
 
     return textType::NORMAL;
@@ -254,7 +254,6 @@ void VisualDenseLayer::setGrad(const model::fnn::LayerParams &newGrad) {
 }
 
 void FnnVisualier::setWidth(const std::uint32_t newWidth) {
-    visualWidth = newWidth;
 
     if (networkRender.resize({newWidth, networkRender.getSize().y})) {
     }
diff --git a/src/networks/fnn/FnnVisualizer.hpp b/src/networks/fnn/FnnVisualizer.hpp
index 067050c..a497484 100644
--- a/src/networks/fnn/FnnVisualizer.hpp
+++ b/src/networks/fnn/FnnVisualizer.hpp
@@ -32,11 +32,11 @@ static const std::array color_lookup = {
 
 class VisualDenseLayer {
 private:
-    global::Tensor net{{0}};
-    global::Tensor out{{0}};
+    global::Tensor net{{1}};
+    global::Tensor out{{1}};
 
-    model::fnn::LayerParams parameters{0, 0};
-    model::fnn::LayerParams gradients{0, 0};
+    model::fnn::LayerParams parameters{1, 1};
+    model::fnn::LayerParams gradients{1, 1};
 
     sf::Vector2f pos;
diff --git a/src/visualizer/VInterface.cpp b/src/visualizer/VInterface.cpp
index 5fa4ebd..3cfbd5c 100644
--- a/src/visualizer/VInterface.cpp
+++ b/src/visualizer/VInterface.cpp
@@ -1,30 +1,46 @@
 #include "VInterface.hpp"
+#include "state.hpp"
 
 namespace nn::visualizer {
-IntefacePanel::IntefacePanel(const std::shared_ptr vstate)
+InterfacePanel::InterfacePanel(const std::shared_ptr vstate)
     : Panel(vstate), VRender({VINTERFACE_WIDTH, VINTERFACE_HEIGHT}) {
     createVInterface();
 }
 
-void IntefacePanel::createVInterface() {
+void InterfacePanel::createVInterface() {
     VRender.clear(INTERFACE_PANEL_COLOR);
     buttons.reserve(STATES_COUNT);
 
+    constexpr std::array<SettingType, 3> skipWhenDisabled = {
+        SettingType::AutoPause,
+        SettingType::Pause,
+        SettingType::PreciseMode};
+
     for (int i = 0; i < STATES_COUNT; ++i) {
-        buttons.push_back(std::make_unique