Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,19 +1,29 @@
ROOT_DIR = $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST)))))

CPPFLAGS += -MMD -I$(ROOT_DIR)/inc
CXXFLAGS += --std=c++17 -O3 -Wall -Wextra -Wshadow -Wpedantic
CXXFLAGS += --std=c++17 -Ofast -Wno-shadow -Wpedantic -fopenmp -Wno-variadic-macros

# CUDA flags
CUDA_FLAGS = -arch=all-major -O3 -ltoir -gen-opt-lto --use_fast_math --cudadevrt static --prec-div=false --extra-device-vectorization --default-stream per-thread
# Compiler and linker
CXX = g++
NVCC = nvcc

# vcpkg integration
TRIPLET_DIR = $(patsubst %/,%,$(firstword $(filter-out $(ROOT_DIR)/vcpkg_installed/vcpkg/, $(wildcard $(ROOT_DIR)/vcpkg_installed/*/))))
CPPFLAGS += -isystem $(TRIPLET_DIR)/include
LDFLAGS += -L$(TRIPLET_DIR)/lib -L$(TRIPLET_DIR)/lib/manual-link
LDLIBS += -llzma -lz -lbz2 -lfmt
LDLIBS += -L$(TRIPLET_DIR)/lib -L$(TRIPLET_DIR)/lib/manual-link -L/usr/local/cuda-12.6/lib64
LDLIBS += -llzma -lz -lbz2 -lfmt -lpthread -lcudart -lcublas -lgomp

TORCH_DIR = /home/john/libtorch/libtorch
CPPFLAGS += -I$(TORCH_DIR)/include -I$(TORCH_DIR)/include/torch/csrc/api/include
LDLIBS += -L$(TORCH_DIR)/lib -lc10 -ltorch -ltorch_cpu -lc

.phony: all all_execs clean configclean test makedirs
# Declare pseudo-targets. Note: GNU make only recognizes the uppercase .PHONY
# special target — lowercase ".phony" is silently ignored. cuda.o is a real
# build artifact and must NOT be phony, or it would be rebuilt on every run.
.PHONY: all all_execs clean configclean test makedirs

test_main_name=$(ROOT_DIR)/test/bin/000-test-main

all: all_execs
all: cuda.o all_execs

# Generated configuration makefile contains:
# - $(executable_name), the list of all executables in the configuration
Expand All @@ -33,6 +43,7 @@ clean:
@-$(RM) inc/cache_modules.h
@-$(RM) inc/ooo_cpu_modules.h
@-$(RM) src/core_inst.cc
@-$(RM) cuda.o
@-$(RM) $(test_main_name)

# Remove all configuration files
Expand All @@ -48,18 +59,18 @@ $(filter-out test, $(sort $(build_dirs) $(module_dirs))): | $(dir $@)
$(build_objs) $(module_objs):
$(COMPILE.cc) $(OUTPUT_OPTION) $<

# Compile the CUDA translation unit. Listing cuda.cu as a prerequisite makes
# make rebuild the object when the source changes (the rule previously had no
# prerequisites, so it only ran because the target was marked phony).
cuda.o: $(ROOT_DIR)/inc/cuda.cu
	$(NVCC) -c $< -o $@ $(CUDA_FLAGS)

# Add address sanitizers for tests
#$(test_main_name): CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer
$(test_main_name): CXXFLAGS += -g3 -Og -Wconversion
$(test_main_name): LDLIBS += -lCatch2Main -lCatch2

# Link test executable
$(test_main_name):
$(LINK.cc) $(LDFLAGS) -o $@ $(filter-out %/main.o, $^) $(LOADLIBES) $(LDLIBS)

# Link main executables
# Link main executables against the CUDA object. CXXFLAGS and LDFLAGS are
# passed so link-relevant flags (-fopenmp, the -L search paths added to
# LDFLAGS above) apply at link time, not only at compile time.
$(filter-out $(test_main_name), $(executable_name)):
	$(CXX) $(CXXFLAGS) $(LDFLAGS) $^ cuda.o $(LOADLIBES) $(LDLIBS) -o $@

# Tests: build and run
test: $(test_main_name)
Expand All @@ -68,4 +79,4 @@ test: $(test_main_name)
pytest:
PYTHONPATH=$(PYTHONPATH):$(shell pwd) python3 -m unittest discover -v --start-directory='test/python'

-include $(foreach dir,$(wildcard .csconfig/*/) $(wildcard .csconfig/test/*/),$(wildcard $(dir)/obj/*.d))
-include $(foreach dir,$(wildcard .csconfig/*/) $(wildcard .csconfig/test/*/),$(wildcard $(dir)/obj/*.d))
9 changes: 9 additions & 0 deletions branch/False/False.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include "ooo_cpu.h"

// Static predictor: no state to set up, so initialization is a no-op.
void O3_CPU::initialize_branch_predictor() {}

// "Always not-taken" baseline predictor: every branch is predicted
// not taken regardless of its address.
uint8_t O3_CPU::predict_branch(uint64_t ip)
{
  (void)ip; // the prediction is static, so the branch address is unused
  return 0;
}

// Static predictor: outcomes are ignored, nothing to train or update.
void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type){}
89 changes: 89 additions & 0 deletions branch/Linear_NN/Linear_NN.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#include <torch/torch.h>
#include <iostream>
#include <vector>
#include "ooo_cpu.h"

constexpr std::size_t HISTORY_LENGTH = 64;

// Two-layer MLP branch predictor: (1 + HISTORY_LENGTH) inputs -> 16 hidden
// units (ReLU) -> 1 sigmoid output interpreted as P(taken).
struct Net : torch::nn::Module {
Net()
: linear1(register_module("linear1", torch::nn::Linear(1 + HISTORY_LENGTH, 16))), // Input: normalized IP + history bits
linear2(register_module("linear2", torch::nn::Linear(16, 1))) {
// Extra learnable scalar bias added to the final logit before the sigmoid.
another_bias = register_parameter("b", torch::randn(1));
}

// Forward pass; expects input of shape {batch, 1 + HISTORY_LENGTH}
// and returns a {batch, 1} tensor of taken-probabilities in (0, 1).
torch::Tensor forward(torch::Tensor input) {
input = torch::relu(linear1(input)); // Apply ReLU activation
return torch::sigmoid(linear2(input) + another_bias); // Sigmoid to output probability
}

torch::nn::Linear linear1, linear2;
torch::Tensor another_bias;
};

// Global branch-history shift register; the most recent outcome is written to
// the top bit in last_branch_result.
std::bitset<HISTORY_LENGTH> Global_History;
// File-scope model instance shared by predict/update; assumes single-threaded
// use by the simulator — TODO confirm.
Net net;

// The network and history are file-scope globals constructed at load time, so
// there is nothing left to initialize here.
void O3_CPU::initialize_branch_predictor() {}

uint8_t O3_CPU::predict_branch(uint64_t ip)
{
// Convert history into Tensor
std::vector<float> history_features;
for (size_t i = 0; i < HISTORY_LENGTH; ++i) {
history_features.push_back(Global_History[i] ? 1.0f : 0.0f); // Convert bitset to float
}

// Normalize IP
float norm_ip = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
history_features.insert(history_features.begin(), norm_ip); // Insert normalized IP

// Convert to Tensor
torch::Tensor input = torch::tensor(history_features).view({1, HISTORY_LENGTH + 1});

// Forward pass through neural network
torch::Tensor output = net.forward(input);
float prediction = output.item<float>();

std::cout << "Prediction: " << prediction << std::endl;
return prediction > 0.5 ? 1 : 0;
}

// Online training step: after each retired branch, run one Adam update of the
// MLP toward the observed outcome, then shift the outcome into the history.
void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type)
{
  // Build the same [normalized IP, history bits] feature vector that
  // predict_branch uses, so training and inference see identical inputs.
  std::vector<float> features;
  features.reserve(HISTORY_LENGTH + 1);
  features.push_back(static_cast<float>(ip) / static_cast<float>(UINT64_MAX));
  for (std::size_t i = 0; i < HISTORY_LENGTH; ++i)
    features.push_back(Global_History[i] ? 1.0f : 0.0f);

  torch::Tensor input = torch::tensor(features).view({1, HISTORY_LENGTH + 1});

  // Ground-truth label as a {1,1} float tensor (1.0 = taken). The per-branch
  // "Taken:" std::cout debug print was removed from this hot path.
  torch::Tensor target = torch::tensor(static_cast<float>(taken), torch::dtype(torch::kFloat32)).view({1, 1});

  // The optimizer is function-static so Adam's moment estimates persist
  // across calls (one optimizer for the lifetime of the run, lr = 1e-3).
  static torch::optim::Adam optimizer(net.parameters(), torch::optim::AdamOptions(0.001));

  // Forward, loss, backward, step: one SGD step per retired branch.
  torch::Tensor prediction = net.forward(input);
  torch::Tensor loss = torch::binary_cross_entropy(prediction, target);

  optimizer.zero_grad();
  loss.backward();
  optimizer.step();

  // Shift history right and record this outcome in the top bit.
  Global_History >>= 1;
  Global_History[HISTORY_LENGTH - 1] = taken;
}
158 changes: 158 additions & 0 deletions branch/Transformer_NN/Transformer_NN.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#define TORCH_WARN_OFF
#include <torch/torch.h>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <mutex>
#include "ooo_cpu.h"

#define LOCAL_HISTORY_SIZE 8192
#define HIDDEN_SIZE 16
#define NUM_LAYERS 1
#define LEARNING_RATE 0.0001
#define NUM_HEADS 4

constexpr size_t LOCAL_HISTORY_BITS = 8;
constexpr size_t GLOBAL_HISTORY_BITS = 64;
constexpr size_t INPUT_SIZE = LOCAL_HISTORY_BITS + GLOBAL_HISTORY_BITS + 1;

// Per-branch local history table, indexed by ip % LOCAL_HISTORY_SIZE
// (aliasing between branches that share an index is accepted).
std::vector<std::bitset<LOCAL_HISTORY_BITS>> Local_History(LOCAL_HISTORY_SIZE);
// Global branch-history shift register; most recent outcome in the top bit.
std::bitset<GLOBAL_HISTORY_BITS> Global_History;

// Mutex guarding optimizer updates (also taken by log_debug below).
std::mutex optimizer_mutex;

// Append-mode log file used by log_debug for asynchronous-style logging.
std::ofstream log_file("debug.log", std::ios::app);

// Thread-safe append of one line to the debug log.
// Uses a dedicated logging mutex: the original locked optimizer_mutex, which
// needlessly serialized log writes against optimizer training steps.
void log_debug(const std::string &msg) {
  static std::mutex log_mutex;
  std::lock_guard<std::mutex> lock(log_mutex);
  log_file << msg << std::endl;
}

// Print a rough estimate of the model's parameter count and storage cost.
// The transformer terms ignore biases and layer-norm parameters, so treat the
// total as an approximation (the original author flagged the same caveat).
void print_model_size() {
  const size_t fc_in_params = INPUT_SIZE * HIDDEN_SIZE + HIDDEN_SIZE;
  const size_t fc_out_params = HIDDEN_SIZE * 2 + 2;
  const size_t transformer_encoder_params = NUM_LAYERS * (4 * (HIDDEN_SIZE * HIDDEN_SIZE / NUM_HEADS) + 2 * HIDDEN_SIZE * HIDDEN_SIZE);
  const size_t transformer_decoder_params = NUM_LAYERS * (4 * (HIDDEN_SIZE * HIDDEN_SIZE / NUM_HEADS) + 2 * HIDDEN_SIZE * HIDDEN_SIZE);

  const size_t total_params = fc_in_params + fc_out_params + transformer_encoder_params + transformer_decoder_params;
  const double model_size_kb = (total_params * sizeof(float)) / 1024.0;
  // LOCAL_HISTORY_SIZE * LOCAL_HISTORY_BITS is a count of BITS; divide by 8
  // for bytes before converting to KB (the original reported Kbits as "KB").
  const double local_history_size_kb = (LOCAL_HISTORY_SIZE * LOCAL_HISTORY_BITS) / 8.0 / 1024.0;

  std::cout << "Model size: " << model_size_kb << " KB (" << total_params << " parameters)" << std::endl;
  std::cout << "Local history size: " << local_history_size_kb << " KB (" << LOCAL_HISTORY_SIZE
            << " entries, " << LOCAL_HISTORY_BITS << " bits per entry)" << std::endl;
}

// Transformer-based branch predictor: Linear(INPUT_SIZE -> HIDDEN_SIZE) ->
// encoder -> decoder -> Linear(HIDDEN_SIZE -> 2) with softmax over
// {not-taken, taken}. The optimizer is declared last so that, in member
// initialization order, parameters() is called only after every submodule has
// been registered.
struct TransformerPredictor : torch::nn::Module {
TransformerPredictor()
: fc_in(register_module("fc_in", torch::nn::Linear(INPUT_SIZE, HIDDEN_SIZE))),
transformer_encoder_layer(register_module("encoder_layer",
torch::nn::TransformerEncoderLayer(torch::nn::TransformerEncoderLayerOptions(HIDDEN_SIZE, NUM_HEADS)))),
transformer_encoder(register_module("encoder",
torch::nn::TransformerEncoder(transformer_encoder_layer, NUM_LAYERS))),
transformer_decoder_layer(register_module("decoder_layer",
torch::nn::TransformerDecoderLayer(torch::nn::TransformerDecoderLayerOptions(HIDDEN_SIZE, NUM_HEADS)))),
transformer_decoder(register_module("decoder",
torch::nn::TransformerDecoder(transformer_decoder_layer, NUM_LAYERS))),
fc_out(register_module("fc_out", torch::nn::Linear(HIDDEN_SIZE, 2))),
optimizer(std::make_unique<torch::optim::Adam>(parameters(), torch::optim::AdamOptions(LEARNING_RATE))),
update_count(0), forward_count(0) {

torch::set_num_threads(1); // Limit PyTorch threading for the simulator
}

// Forward pass. `input` is the embedded feature tensor; the decoder attends
// over the encoder output ("memory"), the result is mean-pooled over dim 1
// and mapped to a {batch, 2} softmax distribution.
torch::Tensor forward(torch::Tensor input) {
input = torch::relu(fc_in(input));
auto memory = transformer_encoder(input);
input = transformer_decoder(input, memory);
forward_count++;

// Periodic progress breadcrumb to the debug log.
if (forward_count % 10000 == 0) {
log_debug("[DEBUG] Forward count: " + std::to_string(forward_count));
}

return torch::softmax(fc_out(input.mean(1)), 1);
}

torch::nn::Linear fc_in, fc_out;
torch::nn::TransformerEncoderLayer transformer_encoder_layer;
torch::nn::TransformerEncoder transformer_encoder;
torch::nn::TransformerDecoderLayer transformer_decoder_layer;
torch::nn::TransformerDecoder transformer_decoder;
std::unique_ptr<torch::optim::Adam> optimizer;
int update_count;
int forward_count;
};

// File-scope model instance shared by predict_branch and last_branch_result.
TransformerPredictor transformer_net;

// The model is a file-scope global; at startup we only report its
// (approximate) size for the experiment logs.
void O3_CPU::initialize_branch_predictor() {
print_model_size();
}

uint8_t O3_CPU::predict_branch(uint64_t ip) {
size_t index = ip % LOCAL_HISTORY_SIZE;
std::bitset<LOCAL_HISTORY_BITS>& local_history = Local_History[index];

// XOR global history with the IP
uint64_t transformed_global_history = Global_History.to_ullong() ^ ip;

std::array<float, INPUT_SIZE> features;
features[0] = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);

for (size_t i = 0; i < LOCAL_HISTORY_BITS; ++i)
features[i + 1] = local_history[i] ? 1.0f : 0.0f;

for (size_t i = 0; i < GLOBAL_HISTORY_BITS; ++i)
features[LOCAL_HISTORY_BITS + 1 + i] = (transformed_global_history >> i) & 1 ? 1.0f : 0.0f;

// Convert std::array to std::vector before creating tensor
torch::Tensor input = torch::tensor(std::vector<float>(features.begin(), features.end()),
torch::dtype(torch::kFloat32)).view({1, 1, INPUT_SIZE});

return transformer_net.forward(input).argmax(1).item<int>();
}

// Online training step: after each retired branch, run one Adam update of the
// transformer toward the observed outcome, then shift the outcome into both
// the local (per-index) and global history registers.
void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type) {
size_t index = ip % LOCAL_HISTORY_SIZE;
std::bitset<LOCAL_HISTORY_BITS>& local_history = Local_History[index];

// XOR global history with the IP (same gshare-style hash as predict_branch)
uint64_t transformed_global_history = Global_History.to_ullong() ^ ip;

// Rebuild the exact feature vector predict_branch used for this branch.
std::array<float, INPUT_SIZE> features;
features[0] = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);

for (size_t i = 0; i < LOCAL_HISTORY_BITS; ++i)
features[i + 1] = local_history[i] ? 1.0f : 0.0f;

for (size_t i = 0; i < GLOBAL_HISTORY_BITS; ++i)
features[LOCAL_HISTORY_BITS + 1 + i] = (transformed_global_history >> i) & 1 ? 1.0f : 0.0f;

// Convert std::array to std::vector before creating tensor
torch::Tensor input = torch::tensor(std::vector<float>(features.begin(), features.end()),
torch::dtype(torch::kFloat32)).view({1, 1, INPUT_SIZE});

// One-hot {1,2} target over {not-taken, taken}; loss is BCE against the
// model's softmax output.
torch::Tensor target = torch::tensor({1.0f - static_cast<float>(taken), static_cast<float>(taken)}).view({1, 2});
torch::Tensor prediction = transformer_net.forward(input);
torch::Tensor loss = torch::binary_cross_entropy(prediction, target);

// Mutex to protect optimizer in parallel tests
{
std::lock_guard<std::mutex> lock(optimizer_mutex);
transformer_net.optimizer->zero_grad();
loss.backward();
transformer_net.optimizer->step();
}

// Update histories: shift right, newest outcome goes into the top bit.
local_history >>= 1;
local_history[LOCAL_HISTORY_BITS - 1] = taken;
Global_History >>= 1;
Global_History[GLOBAL_HISTORY_BITS - 1] = taken;
}
9 changes: 9 additions & 0 deletions branch/True/True.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include "ooo_cpu.h"

// Static predictor: no state to set up, so initialization is a no-op.
void O3_CPU::initialize_branch_predictor() {}

// "Always taken" baseline predictor: every branch is predicted taken
// regardless of its address.
uint8_t O3_CPU::predict_branch(uint64_t ip)
{
  (void)ip; // the prediction is static, so the branch address is unused
  return 1;
}

// Static predictor: outcomes are ignored, nothing to train or update.
void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type){}
Loading