diff --git a/Makefile b/Makefile
index 74b40105d1..94675c3f4a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,19 +1,29 @@
 ROOT_DIR = $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST)))))
 CPPFLAGS += -MMD -I$(ROOT_DIR)/inc
-CXXFLAGS += --std=c++17 -O3 -Wall -Wextra -Wshadow -Wpedantic
+CXXFLAGS += --std=c++17 -Ofast -Wno-shadow -Wpedantic -fopenmp -Wno-variadic-macros
+
+# CUDA flags
+CUDA_FLAGS = -arch=all-major -O3 -ltoir -gen-opt-lto --use_fast_math --cudadevrt static --prec-div=false --extra-device-vectorization --default-stream per-thread
+
+# Compiler and linker
+CXX = g++
+NVCC = nvcc
 
 # vcpkg integration
 TRIPLET_DIR = $(patsubst %/,%,$(firstword $(filter-out $(ROOT_DIR)/vcpkg_installed/vcpkg/, $(wildcard $(ROOT_DIR)/vcpkg_installed/*/))))
 CPPFLAGS += -isystem $(TRIPLET_DIR)/include
-LDFLAGS += -L$(TRIPLET_DIR)/lib -L$(TRIPLET_DIR)/lib/manual-link
-LDLIBS += -llzma -lz -lbz2 -lfmt
+LDLIBS += -L$(TRIPLET_DIR)/lib -L$(TRIPLET_DIR)/lib/manual-link -L/usr/local/cuda-12.6/lib64
+LDLIBS += -llzma -lz -lbz2 -lfmt -lpthread -lcudart -lcublas -lgomp
+
+TORCH_DIR = /home/john/libtorch/libtorch
+CPPFLAGS += -I$(TORCH_DIR)/include -I$(TORCH_DIR)/include/torch/csrc/api/include
+LDLIBS += -L$(TORCH_DIR)/lib -lc10 -ltorch -ltorch_cpu -lc
 
-.phony: all all_execs clean configclean test makedirs
+.phony: all all_execs clean configclean test makedirs cuda.o
 
 test_main_name=$(ROOT_DIR)/test/bin/000-test-main
 
-all: all_execs
+all: cuda.o all_execs
 
 # Generated configuration makefile contains:
 # - $(executable_name), the list of all executables in the configuration
@@ -33,6 +43,7 @@ clean:
 	@-$(RM) inc/cache_modules.h
 	@-$(RM) inc/ooo_cpu_modules.h
 	@-$(RM) src/core_inst.cc
+	@-$(RM) cuda.o
 	@-$(RM) $(test_main_name)
 
 # Remove all configuration files
@@ -48,18 +59,18 @@ $(filter-out test, $(sort $(build_dirs) $(module_dirs))): | $(dir $@)
 $(build_objs) $(module_objs):
 	$(COMPILE.cc) $(OUTPUT_OPTION) $<
 
+cuda.o:
+	$(NVCC) -c $(ROOT_DIR)/inc/cuda.cu -o $@ $(CUDA_FLAGS)
+
 # Add address sanitizers for tests
 #$(test_main_name): CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer
 $(test_main_name): CXXFLAGS += -g3 -Og -Wconversion
 $(test_main_name): LDLIBS += -lCatch2Main -lCatch2
-# Link test executable
-$(test_main_name):
-	$(LINK.cc) $(LDFLAGS) -o $@ $(filter-out %/main.o, $^) $(LOADLIBES) $(LDLIBS)
 
 # Link main executables
 $(filter-out $(test_main_name), $(executable_name)):
-	$(LINK.cc) $(LDFLAGS) -o $@ $^ $(LOADLIBES) $(LDLIBS)
+	$(CXX) $^ cuda.o $(LDLIBS) -o $@
 
 # Tests: build and run
 test: $(test_main_name)
@@ -68,4 +79,4 @@ test: $(test_main_name)
 pytest:
 	PYTHONPATH=$(PYTHONPATH):$(shell pwd) python3 -m unittest discover -v --start-directory='test/python'
 
--include $(foreach dir,$(wildcard .csconfig/*/) $(wildcard .csconfig/test/*/),$(wildcard $(dir)/obj/*.d))
+-include $(foreach dir,$(wildcard .csconfig/*/) $(wildcard .csconfig/test/*/),$(wildcard $(dir)/obj/*.d))
\ No newline at end of file
diff --git a/branch/False/False.cc b/branch/False/False.cc
new file mode 100644
index 0000000000..98a06eb620
--- /dev/null
+++ b/branch/False/False.cc
@@ -0,0 +1,9 @@
+#include "ooo_cpu.h"
+
+void O3_CPU::initialize_branch_predictor() {}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip){
+  return 0;
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type){}
diff --git a/branch/Linear_NN/Linear_NN.cc b/branch/Linear_NN/Linear_NN.cc
new file mode 100644
index 0000000000..eed6584b43
--- /dev/null
+++ b/branch/Linear_NN/Linear_NN.cc
@@ -0,0 +1,89 @@
+#include <torch/torch.h>
+#include <bitset>
+#include <iostream>
+#include "ooo_cpu.h"
+
+constexpr std::size_t HISTORY_LENGTH = 64;
+
+struct Net : torch::nn::Module {
+  Net()
+      : linear1(register_module("linear1", torch::nn::Linear(1 + HISTORY_LENGTH, 16))), // Input: IP + History
+        linear2(register_module("linear2", torch::nn::Linear(16, 1))) {
+    another_bias = register_parameter("b", torch::randn(1));
+  }
+
+  torch::Tensor forward(torch::Tensor input) {
+    input = torch::relu(linear1(input));                  // Apply ReLU activation
+    return torch::sigmoid(linear2(input) + another_bias); // Sigmoid to output probability
+  }
+
+  torch::nn::Linear linear1, linear2;
+  torch::Tensor another_bias;
+};
+
+std::bitset<HISTORY_LENGTH> Global_History;
+Net net;
+
+void O3_CPU::initialize_branch_predictor() {}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip)
+{
+  // Convert history into a feature vector
+  std::vector<float> history_features;
+  for (size_t i = 0; i < HISTORY_LENGTH; ++i) {
+    history_features.push_back(Global_History[i] ? 1.0f : 0.0f); // Convert bitset to float
+  }
+
+  // Normalize IP
+  float norm_ip = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+  history_features.insert(history_features.begin(), norm_ip); // Insert normalized IP
+
+  // Convert to Tensor
+  torch::Tensor input = torch::tensor(history_features).view({1, HISTORY_LENGTH + 1});
+
+  // Forward pass through neural network
+  torch::Tensor output = net.forward(input);
+  float prediction = output.item<float>();
+
+  std::cout << "Prediction: " << prediction << std::endl;
+  return prediction > 0.5 ? 1 : 0;
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type)
+{
+  // Convert history into a feature vector
+  std::vector<float> history_features;
+  for (size_t i = 0; i < HISTORY_LENGTH; ++i) {
+    history_features.push_back(Global_History[i] ? 1.0f : 0.0f);
+  }
+
+  // Normalize IP
+  float norm_ip = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+  history_features.insert(history_features.begin(), norm_ip);
+
+  // Convert to Tensor
+  torch::Tensor input = torch::tensor(history_features).view({1, HISTORY_LENGTH + 1});
+
+  std::cout << "Taken: " << (int)taken << std::endl;
+
+  // Convert expected output
+  torch::Tensor target = torch::tensor(static_cast<float>(taken), torch::dtype(torch::kFloat32)).view({1, 1});
+
+  // Define optimizer (Adam for adaptive learning)
+  static torch::optim::Adam optimizer(net.parameters(), torch::optim::AdamOptions(0.001));
+
+  // Forward pass
+  torch::Tensor prediction = net.forward(input);
+
+  // Compute loss
+  torch::Tensor loss = torch::binary_cross_entropy(prediction, target);
+
+  // Backpropagation
+  optimizer.zero_grad();
+  loss.backward();
+  optimizer.step();
+
+  // Update global history
+  Global_History >>= 1;
+  Global_History[HISTORY_LENGTH - 1] = taken;
+}
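Note on the predictor above: every retired branch triggers one forward pass, one BCE loss, and one Adam step against the observed outcome. The following is a minimal standalone sketch of that online-training loop, assuming only libtorch; `main`, `ip`, and `taken` are illustrative stand-ins, and the `static` optimizer inside the hook is replaced by one constructed up front.

#include <torch/torch.h>
#include <bitset>
#include <vector>

int main()
{
  constexpr int64_t HIST = 64;
  std::bitset<HIST> history; // global branch history, oldest bit shifted out each update

  // Same shape as Net above: (1 + HIST) features -> 16 -> 1 probability
  auto net = torch::nn::Sequential(torch::nn::Linear(1 + HIST, 16), torch::nn::ReLU(),
                                   torch::nn::Linear(16, 1), torch::nn::Sigmoid());
  torch::optim::Adam opt(net->parameters(), torch::optim::AdamOptions(0.001));

  uint64_t ip = 0x400123; // example branch address
  uint8_t taken = 1;      // example outcome

  // Feature row: normalized IP first, then the 64 history bits as 0.0/1.0
  std::vector<float> feat{static_cast<float>(ip) / static_cast<float>(UINT64_MAX)};
  for (int64_t i = 0; i < HIST; ++i)
    feat.push_back(history[i] ? 1.0f : 0.0f);
  auto input = torch::tensor(feat).view({1, 1 + HIST});

  // One online step: predict, score against the outcome, update the weights
  auto pred = net->forward(input);
  auto target = torch::full({1, 1}, static_cast<float>(taken));
  auto loss = torch::binary_cross_entropy(pred, target);
  opt.zero_grad();
  loss.backward();
  opt.step();

  // Shift the outcome into the history, as last_branch_result does above
  history >>= 1;
  history[HIST - 1] = taken;
}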
diff --git a/branch/Transformer_NN/Transformer_NN.cc b/branch/Transformer_NN/Transformer_NN.cc
new file mode 100644
index 0000000000..ee1a89fae8
--- /dev/null
+++ b/branch/Transformer_NN/Transformer_NN.cc
@@ -0,0 +1,158 @@
+#define TORCH_WARN_OFF
+#include <torch/torch.h>
+#include <iostream>
+#include <fstream>
+#include <mutex>
+#include <bitset>
+#include "ooo_cpu.h"
+
+#define LOCAL_HISTORY_SIZE 8192
+#define HIDDEN_SIZE 16
+#define NUM_LAYERS 1
+#define LEARNING_RATE 0.0001
+#define NUM_HEADS 4
+
+constexpr size_t LOCAL_HISTORY_BITS = 8;
+constexpr size_t GLOBAL_HISTORY_BITS = 64;
+constexpr size_t INPUT_SIZE = LOCAL_HISTORY_BITS + GLOBAL_HISTORY_BITS + 1;
+
+// Bimodal table for local branch history
+std::vector<std::bitset<LOCAL_HISTORY_BITS>> Local_History(LOCAL_HISTORY_SIZE);
+// Global branch history
+std::bitset<GLOBAL_HISTORY_BITS> Global_History;
+
+// Mutex for optimizer updates
+std::mutex optimizer_mutex;
+
+// Log file for async logging
+std::ofstream log_file("debug.log", std::ios::app);
+
+void log_debug(const std::string& msg) {
+  std::lock_guard<std::mutex> lock(optimizer_mutex);
+  log_file << msg << std::endl;
+}
+
+// this printout could be entirely wrong I really have no idea how to calculate this
+void print_model_size() {
+  size_t fc_in_params = INPUT_SIZE * HIDDEN_SIZE + HIDDEN_SIZE;
+  size_t fc_out_params = HIDDEN_SIZE * 2 + 2;
+  size_t transformer_encoder_params = NUM_LAYERS * (4 * (HIDDEN_SIZE * HIDDEN_SIZE / NUM_HEADS) + 2 * HIDDEN_SIZE * HIDDEN_SIZE);
+  size_t transformer_decoder_params = NUM_LAYERS * (4 * (HIDDEN_SIZE * HIDDEN_SIZE / NUM_HEADS) + 2 * HIDDEN_SIZE * HIDDEN_SIZE);
+
+  size_t total_params = fc_in_params + fc_out_params + transformer_encoder_params + transformer_decoder_params;
+  double model_size_kb = (total_params * sizeof(float)) / 1024.0;
+  double local_history_size_kb = (LOCAL_HISTORY_SIZE * LOCAL_HISTORY_BITS) / 1024.0;
+
+  std::cout << "Model size: " << model_size_kb << " KB (" << total_params << " parameters)" << std::endl;
+  std::cout << "Local history size: " << local_history_size_kb << " KB (" << LOCAL_HISTORY_SIZE
+            << " entries, " << LOCAL_HISTORY_BITS << " bits per entry)" << std::endl;
+}
+
+struct TransformerPredictor : torch::nn::Module {
+  TransformerPredictor()
+      : fc_in(register_module("fc_in", torch::nn::Linear(INPUT_SIZE, HIDDEN_SIZE))),
+        transformer_encoder_layer(register_module("encoder_layer",
+            torch::nn::TransformerEncoderLayer(torch::nn::TransformerEncoderLayerOptions(HIDDEN_SIZE, NUM_HEADS)))),
+        transformer_encoder(register_module("encoder",
+            torch::nn::TransformerEncoder(transformer_encoder_layer, NUM_LAYERS))),
+        transformer_decoder_layer(register_module("decoder_layer",
+            torch::nn::TransformerDecoderLayer(torch::nn::TransformerDecoderLayerOptions(HIDDEN_SIZE, NUM_HEADS)))),
+        transformer_decoder(register_module("decoder",
+            torch::nn::TransformerDecoder(transformer_decoder_layer, NUM_LAYERS))),
+        fc_out(register_module("fc_out", torch::nn::Linear(HIDDEN_SIZE, 2))),
+        optimizer(std::make_unique<torch::optim::Adam>(parameters(), torch::optim::AdamOptions(LEARNING_RATE))),
+        update_count(0), forward_count(0) {
+    torch::set_num_threads(1); // Limit PyTorch threading
+  }
+
+  torch::Tensor forward(torch::Tensor input) {
+    input = torch::relu(fc_in(input));
+    auto memory = transformer_encoder(input);
+    input = transformer_decoder(input, memory);
+    forward_count++;
+
+    if (forward_count % 10000 == 0) {
+      log_debug("[DEBUG] Forward count: " + std::to_string(forward_count));
+    }
+
+    return torch::softmax(fc_out(input.mean(1)), 1);
+  }
+
+  torch::nn::Linear fc_in, fc_out;
+  torch::nn::TransformerEncoderLayer transformer_encoder_layer;
+  torch::nn::TransformerEncoder transformer_encoder;
+  torch::nn::TransformerDecoderLayer transformer_decoder_layer;
+  torch::nn::TransformerDecoder transformer_decoder;
+  std::unique_ptr<torch::optim::Adam> optimizer;
+  int update_count;
+  int forward_count;
+};
+
+TransformerPredictor transformer_net;
+
+void O3_CPU::initialize_branch_predictor() {
+  print_model_size();
+}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip) {
+  size_t index = ip % LOCAL_HISTORY_SIZE;
+  std::bitset<LOCAL_HISTORY_BITS>& local_history = Local_History[index];
+
+  // XOR global history with the IP
+  uint64_t transformed_global_history = Global_History.to_ullong() ^ ip;
+
+  std::array<float, INPUT_SIZE> features;
+  features[0] = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+
+  for (size_t i = 0; i < LOCAL_HISTORY_BITS; ++i)
+    features[i + 1] = local_history[i] ? 1.0f : 0.0f;
+
+  for (size_t i = 0; i < GLOBAL_HISTORY_BITS; ++i)
+    features[LOCAL_HISTORY_BITS + 1 + i] = (transformed_global_history >> i) & 1 ? 1.0f : 0.0f;
+
+  // Convert std::array to std::vector before creating tensor
+  torch::Tensor input = torch::tensor(std::vector<float>(features.begin(), features.end()),
+                                      torch::dtype(torch::kFloat32)).view({1, 1, INPUT_SIZE});
+
+  return transformer_net.forward(input).argmax(1).item<int>();
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type) {
+  size_t index = ip % LOCAL_HISTORY_SIZE;
+  std::bitset<LOCAL_HISTORY_BITS>& local_history = Local_History[index];
+
+  // XOR global history with the IP
+  uint64_t transformed_global_history = Global_History.to_ullong() ^ ip;
+
+  std::array<float, INPUT_SIZE> features;
+  features[0] = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+
+  for (size_t i = 0; i < LOCAL_HISTORY_BITS; ++i)
+    features[i + 1] = local_history[i] ? 1.0f : 0.0f;
+
+  for (size_t i = 0; i < GLOBAL_HISTORY_BITS; ++i)
+    features[LOCAL_HISTORY_BITS + 1 + i] = (transformed_global_history >> i) & 1 ? 1.0f : 0.0f;
+
+  // Convert std::array to std::vector before creating tensor
+  torch::Tensor input = torch::tensor(std::vector<float>(features.begin(), features.end()),
+                                      torch::dtype(torch::kFloat32)).view({1, 1, INPUT_SIZE});
+
+  torch::Tensor target = torch::tensor({1.0f - static_cast<float>(taken), static_cast<float>(taken)}).view({1, 2});
+  torch::Tensor prediction = transformer_net.forward(input);
+  torch::Tensor loss = torch::binary_cross_entropy(prediction, target);
+
+  // Mutex to protect optimizer in parallel tests
+  {
+    std::lock_guard<std::mutex> lock(optimizer_mutex);
+    transformer_net.optimizer->zero_grad();
+    loss.backward();
+    transformer_net.optimizer->step();
+  }
+
+  // Update histories
+  local_history >>= 1;
+  local_history[LOCAL_HISTORY_BITS - 1] = taken;
+  Global_History >>= 1;
+  Global_History[GLOBAL_HISTORY_BITS - 1] = taken;
+}
\ No newline at end of file
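The hand-computed sizes in print_model_size (flagged by the author's own comment) can be cross-checked against libtorch itself, which already knows every registered tensor. A small sketch, assuming libtorch; `probe` is an illustrative stand-in for any module, including the TransformerPredictor above:

#include <torch/torch.h>
#include <cstdio>

// Count trainable scalars in any module (recurses into registered submodules).
size_t count_parameters(const torch::nn::Module& m)
{
  size_t total = 0;
  for (const auto& p : m.parameters())
    total += p.numel(); // number of scalar weights in this tensor
  return total;
}

int main()
{
  // Same shape as fc_in above (INPUT_SIZE = 73): 73*16 weights + 16 biases = 1184
  torch::nn::Linear probe(73, 16);
  size_t n = count_parameters(*probe);
  std::printf("%zu parameters, %.2f KB as float32\n", n, n * sizeof(float) / 1024.0);
}

Calling count_parameters(transformer_net) inside initialize_branch_predictor would give the exact figure that print_model_size only estimates.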
diff --git a/branch/True/True.cc b/branch/True/True.cc
new file mode 100644
index 0000000000..3409790ac9
--- /dev/null
+++ b/branch/True/True.cc
@@ -0,0 +1,9 @@
+#include "ooo_cpu.h"
+
+void O3_CPU::initialize_branch_predictor() {}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip){
+  return 1;
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type){}
diff --git a/branch/tage4/tage4.cc b/branch/tage4/tage4.cc
new file mode 100644
index 0000000000..647d787430
--- /dev/null
+++ b/branch/tage4/tage4.cc
@@ -0,0 +1,346 @@
+
+#include <array>
+#include <bitset>
+#include <cmath>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+
+#include "msl/fwcounter.h"
+#include "ooo_cpu.h"
+
+#define Tag uint16_t
+#define Index uint16_t
+#define Path uint64_t
+#define History uint64_t
+#define BIMODAL_TABLE_SIZE 8192
+#define MAX_INDEX_BITS 12
+#define TAGE_TABLES 4
+#define TAGE_TAG_BITS 7
+#define TAGE_CONTER_BITS 3
+#define TAGE_USEFUL_BITS 2
+#define GLOBAL_HISTORY_LENGTH 1024
+#define PATH_HISTORY_BUFFER_LENGTH 32
+#define TAGE_MIN_LENGTH 5
+#define HISTORY_ALPHA 1.5
+#define TAGE_RESET_INTERVAL 512000
+#define BIMODE_COUNTER_BITS 3
+
+int debug_counter = 0;
+
+struct TAGs {
+  Tag tag;
+  uint8_t useful;
+  uint8_t counter;
+};
+
+class Tage {
+private:
+  int count;
+  std::bitset<GLOBAL_HISTORY_LENGTH> GLOBAL_HISTORY;
+  std::bitset<PATH_HISTORY_BUFFER_LENGTH> PATH_HISTORY;
+  uint8_t T0[BIMODAL_TABLE_SIZE];
+  std::array<TAGs, (1 << MAX_INDEX_BITS)> T[TAGE_TABLES];
+  int table_history_lengths[TAGE_TABLES];
+  uint8_t use_alt_on_na;
+  uint8_t tage_pred, pred, alt_pred;
+  int pred_comp, alt_comp; // Provider and alternate component of last branch PC
+  int STRONG;
+  int debug_ct[7] = {0, 0, 0, 0, 0, 0, 0};
+
+public:
+  void init();                             // initialise the member variables
+  uint8_t predict(uint64_t ip);            // return the prediction from tage
+  void update(uint64_t ip, uint8_t taken); // updates the state of tage
+
+  Index get_bimodal_index(uint64_t ip);              // helper hash function to index into the bimodal table
+  Index get_predictor_index(uint64_t ip, int table); // helper hash function to index into the predictor table using histories
+  Tag get_tag(uint64_t ip, int table);               // helper hash function to get the tag of a particular ip and table
+  int get_match_below_n(uint64_t ip, int table);     // helper function to find the hit table strictly before the table argument
+  void ctr_update(uint8_t& ctr, int cond, int low, int high); // counter update helper function (including clipping)
+  uint8_t get_prediction(uint64_t ip, int comp);     // helper function for prediction
+  Path get_path_history_hash(int table);             // helper hash function to compress the path history
+  History get_compressed_global_history(int inSize, int outSize); // Compress global history of last 'inSize' branches into 'outSize' by wrapping the history
+};
+
+uint8_t Tage::predict(uint64_t ip)
+{
+  pred_comp = get_match_below_n(ip, TAGE_TABLES + 1); // Get the first predictor from the end which matches the PC
+  alt_comp = get_match_below_n(ip, pred_comp);        // Get the first predictor below the provider which matches the PC
+
+  pred = get_prediction(ip, pred_comp);
+  alt_pred = get_prediction(ip, alt_comp);
+
+  // std::cout << debug_ct[0] << "|" << debug_ct[1] << "|" << debug_ct[2] << "|" << debug_ct[3] << "|" << debug_ct[4] << "|" << debug_ct[5] << std::endl;
+  debug_ct[pred_comp]++;
+  if (pred_comp == 0) { // if there is no alternate predictor we use the default bimodal table
+    tage_pred = pred;
+    // debug_ct[0]++;
+  } else { // if there is an alternate predictor
+    Index index = get_predictor_index(ip, pred_comp);
+    STRONG = abs(2 * T[pred_comp - 1][index].counter + 1 - (1 << TAGE_CONTER_BITS)) > 1; // check to see if the current predictor's guess is strong
+    // std::cout << "use_alt_on_na" << int(use_alt_on_na) << " | " << "STRONG:" << STRONG << std::endl;
+    if (use_alt_on_na < 8 || STRONG) {
+      tage_pred = pred; // if the prediction is strong, use that predictor
+      // debug_ct[1]++;
+    } else {
+      tage_pred = alt_pred; // if the prediction is not strong, use an alternate predictor
+    }
+  }
+  return tage_pred;
+}
+
+void Tage::ctr_update(uint8_t& ctr, int cond, int low, int high)
+{
+  if (cond && ctr < high)
+    ctr++;
+  else if (!cond && ctr > low)
+    ctr--;
+}
+
+void Tage::update(uint64_t ip, uint8_t taken)
+{
+  if (pred_comp > 0) {
+    struct TAGs* entry = &T[pred_comp - 1][get_predictor_index(ip, pred_comp)];
+    uint8_t useful = entry->useful;
+    if (!STRONG) {
+      if (pred != alt_pred)
+        // std::cout << "ENTRY UPDATED" << std::endl;
+        ctr_update(use_alt_on_na, !(pred == taken), 0, 15);
+    }
+
+    if (alt_comp > 0) {
+      struct TAGs* alt_entry = &T[alt_comp - 1][get_predictor_index(ip, alt_comp)];
+      if (useful == 0)
+        // std::cout << "ENTRY UPDATED" << std::endl;
+        ctr_update(alt_entry->counter, taken, 0, ((1 << TAGE_CONTER_BITS) - 1));
+    } else {
+      Index index = get_bimodal_index(ip);
+      if (useful == 0)
+        ctr_update(T0[index], taken, 0, ((1 << BIMODE_COUNTER_BITS) - 1));
+    }
+
+    if (pred != alt_pred) {
+      if (pred == taken) {
+        if (entry->useful < ((1 << TAGE_USEFUL_BITS) - 1))
+          entry->useful++;
+      } else {
+        if (use_alt_on_na < 8) {
+          if (entry->useful > 0)
+            entry->useful--;
+        }
+      }
+    }
+    ctr_update(entry->counter, taken, 0, ((1 << TAGE_CONTER_BITS) - 1));
+  } else {
+    Index index = get_bimodal_index(ip);
+    ctr_update(T0[index], taken, 0, ((1 << BIMODE_COUNTER_BITS) - 1));
+  }
+
+  // On a misprediction, try to allocate an entry in a longer-history table
+  if (tage_pred != taken && pred_comp < TAGE_TABLES) {
+    int random = static_cast<int>(rand()) / static_cast<int>(RAND_MAX);
+    random = random & ((1 << (TAGE_TABLES - pred_comp - 1)) - 1);
+    int start_component = pred_comp + 1;
+
+    if (random & 1) {
+      start_component++;
+      if (random & 2)
+        start_component++;
+    }
+    int isFree = 0;
+    for (int i = pred_comp + 1; i <= TAGE_TABLES; i++) {
+      struct TAGs* entry_new = &T[i - 1][get_predictor_index(ip, i)];
+      if (entry_new->useful == 0)
+        isFree = 1;
+    }
+    if (!isFree && start_component <= TAGE_TABLES)
+      T[start_component - 1][get_predictor_index(ip, start_component)].useful = 0;
+
+    for (int i = start_component; i <= TAGE_TABLES; i++) {
+      struct TAGs* entry_new = &T[i - 1][get_predictor_index(ip, i)];
+      if (entry_new->useful == 0) {
+        entry_new->tag = get_tag(ip, i);
+        entry_new->counter = (1 << (TAGE_CONTER_BITS - 1));
+        break;
+      }
+    }
+  }
+
+  for (int i = GLOBAL_HISTORY_LENGTH - 1; i > 0; i--)
+    GLOBAL_HISTORY[i] = GLOBAL_HISTORY[i - 1];
+  GLOBAL_HISTORY[0] = taken;
+
+  for (int i = PATH_HISTORY_BUFFER_LENGTH - 1; i > 0; i--)
+    PATH_HISTORY[i] = PATH_HISTORY[i - 1];
+  PATH_HISTORY[0] = ip & 1;
+
+  count++;
+  if (count % TAGE_RESET_INTERVAL == 0) {
+    count = 0;
+    for (int i = 0; i < TAGE_TABLES; i++) {
+      for (int j = 0; j < (1 << MAX_INDEX_BITS); j++)
+        T[i][j].useful >>= 1;
+    }
+  }
+}
+
+uint8_t Tage::get_prediction(uint64_t ip, int comp)
+{
+  if (comp == 0) {
+    Index index = get_bimodal_index(ip);
+    // std::cout << "T0[index] = " << int(T0[index]) << std::endl;
+    return (T0[index] >= (1 << (TAGE_CONTER_BITS - 1)));
+  } else {
+    Index index = get_predictor_index(ip, comp);
+    // std::cout << "T[comp-1][index] = " << T[comp-1][index].counter << std::endl;
+    return (T[comp - 1][index].counter >= (1 << (TAGE_CONTER_BITS - 1)));
+  }
+}
+
+Index Tage::get_bimodal_index(uint64_t ip)
+{
+  return ip & (BIMODAL_TABLE_SIZE - 1);
+}
+
+Tag Tage::get_tag(uint64_t ip, int table)
+{
+  History global_history_hash = get_compressed_global_history(table_history_lengths[table - 1], TAGE_TAG_BITS);
+  global_history_hash ^= get_compressed_global_history(table_history_lengths[table - 1], TAGE_TAG_BITS - 1);
+  return ((global_history_hash ^ ip) & ((1 << TAGE_TAG_BITS) - 1));
+}
+
+int Tage::get_match_below_n(uint64_t ip, int table)
+{
+  for (int i = table - 1; i >= 1; i--) {
+    Index index = get_predictor_index(ip, i);
+    Tag tag = get_tag(ip, i);
+
+    if (T[i - 1][index].tag == tag) {
+      // std::cout << T[i-1][index].tag << " | " << tag << "|" << i << std::endl;
+      return i;
+    }
+  }
+  return 0;
+}
+
+void Tage::init()
+{
+  srand(time(0));
+  use_alt_on_na = 8;
+  tage_pred = 0;
+  count = 0;
+
+  for (int i = 0; i < BIMODAL_TABLE_SIZE; i++)
+    T0[i] = (1 << (BIMODE_COUNTER_BITS - 1)); // weakly taken
+  for (int i = 0; i < TAGE_TABLES; i++) {
+    for (int j = 0; j < (1 << MAX_INDEX_BITS); j++) {
+      T[i][j].counter = (1 << (BIMODE_COUNTER_BITS - 1));
+      T[i][j].useful = 0;
+      T[i][j].tag = 0;
+    }
+  }
+  double power = 1;
+  for (int i = 0; i < TAGE_TABLES; i++) {
+    table_history_lengths[i] = int(TAGE_MIN_LENGTH * power + 0.5);
+    power *= HISTORY_ALPHA;
+    std::cout << "lengths " << table_history_lengths[i] << std::endl;
+  }
+  // for (int i = 0; i < TAGE_TABLES; i++)
+  // {
+  //   for (int j = 0; j < (1 << MAX_INDEX_BITS); j++)
+  //     std::cout << T[i][j].tag << "|" << int(T[i][j].useful) << "|" << int(T[i][j].counter) << std::endl;
+  // }
+  // exit(0);
+}
+
+Path Tage::get_path_history_hash(int table)
+{
+  Path A = 0;
+  Path size = table_history_lengths[table - 1] > 16 ? 16 : table_history_lengths[table - 1];
+  for (int i = PATH_HISTORY_BUFFER_LENGTH - 1; i >= 0; i--)
+    A = (A << 1) | PATH_HISTORY[i];
+  A = A & ((1 << size) - 1);
+
+  Path A1, A2;
+  A1 = A & ((1 << MAX_INDEX_BITS) - 1);
+  A2 = A >> MAX_INDEX_BITS;
+
+  // Use hashing from the CBP-4 L-TAGE submission
+  A2 = ((A2 << table) & ((1 << MAX_INDEX_BITS) - 1)) + (A2 >> abs(MAX_INDEX_BITS - table));
+  A = A1 ^ A2;
+  A = ((A << table) & ((1 << MAX_INDEX_BITS) - 1)) + (A >> abs(MAX_INDEX_BITS - table));
+  return (A);
+}
+
+Index Tage::get_predictor_index(uint64_t ip, int table)
+{
+  Path path_history_hash = get_path_history_hash(table);
+
+  // Hash of global history
+  History global_history_hash = get_compressed_global_history(table_history_lengths[table - 1], MAX_INDEX_BITS);
+
+  // Really complex hashing function
+  return (global_history_hash ^ ip ^ (ip >> (abs(MAX_INDEX_BITS - table) + 1)) ^ path_history_hash) & ((1 << MAX_INDEX_BITS) - 1);
+}
+
+History Tage::get_compressed_global_history(int inSize, int outSize)
+{
+  History compressed_history = 0;
+  History temporary_history = 0;
+  int compressed_history_length = outSize;
+  for (int i = 0; i < inSize; i++) {
+    if (i % compressed_history_length == 0) {
+      compressed_history ^= temporary_history;
+      temporary_history = 0;
+    }
+    temporary_history = (temporary_history << 1) | GLOBAL_HISTORY[i];
+  }
+  compressed_history ^= temporary_history;
+  return compressed_history;
+}
+
+Tage tage_predictor[NUM_CPUS];
+
+void O3_CPU::initialize_branch_predictor()
+{
+  tage_predictor[cpu].init();
+}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip)
+{
+  // std::cout << debug_counter << std::endl;
+  return tage_predictor[cpu].predict(ip);
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type)
+{
+  tage_predictor[cpu].update(ip, taken);
+}
\ No newline at end of file
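Tage::init grows the per-table history lengths geometrically, length_i = round(TAGE_MIN_LENGTH * HISTORY_ALPHA^i), which for the constants above yields 5, 8, 11, 17 bits across the four tagged tables. A standalone check of that series, using the same rounding as init:

#include <cstdio>

// Reproduces the table_history_lengths loop from Tage::init() with
// TAGE_MIN_LENGTH = 5, HISTORY_ALPHA = 1.5, TAGE_TABLES = 4.
// Prints: 5, 8, 11, 17.
int main()
{
  const double alpha = 1.5;
  double power = 1;
  for (int i = 0; i < 4; i++) {
    std::printf("T%d history length: %d\n", i + 1, int(5 * power + 0.5));
    power *= alpha;
  }
}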
diff --git a/champsim_config.json b/champsim_config.json
index 9631d1e5d4..4dbf394dff 100644
--- a/champsim_config.json
+++ b/champsim_config.json
@@ -2,7 +2,7 @@
     "executable_name": "champsim",
     "block_size": 64,
     "page_size": 4096,
-    "heartbeat_frequency": 10000000,
+    "heartbeat_frequency": 100,
     "num_cores": 1,
     "ooo_cpu": [
         {
@@ -26,7 +26,7 @@
             "dispatch_latency": 1,
             "schedule_latency": 0,
             "execute_latency": 0,
-            "branch_predictor": "transformer",
+            "branch_predictor": "lstm_predict",
             "btb": "basic_btb"
         }
     ],
diff --git a/src/json_printer.cc b/src/json_printer.cc
index cb2fdbf8c8..fc8230b6d4 100644
--- a/src/json_printer.cc
+++ b/src/json_printer.cc
@@ -14,82 +14,89 @@
  * limitations under the License.
  */
 
-#include <cmath>
-#include <numeric>
-
-#include "stats_printer.h"
-#include <nlohmann/json.hpp>
-
-void to_json(nlohmann::json& j, const O3_CPU::stats_type stats)
-{
-  std::array<std::pair<std::string, std::size_t>, 6> types{
-      {std::pair{"BRANCH_DIRECT_JUMP", BRANCH_DIRECT_JUMP}, std::pair{"BRANCH_INDIRECT", BRANCH_INDIRECT}, std::pair{"BRANCH_CONDITIONAL", BRANCH_CONDITIONAL},
-       std::pair{"BRANCH_DIRECT_CALL", BRANCH_DIRECT_CALL}, std::pair{"BRANCH_INDIRECT_CALL", BRANCH_INDIRECT_CALL},
-       std::pair{"BRANCH_RETURN", BRANCH_RETURN}}};
-
-  auto total_mispredictions = std::ceil(
-      std::accumulate(std::begin(types), std::end(types), 0ll, [btm = stats.branch_type_misses](auto acc, auto next) { return acc + btm[next.second]; }));
-
-  std::map<std::string, std::size_t> mpki{};
-  for (auto [name, idx] : types)
-    mpki.emplace(name, stats.branch_type_misses[idx]);
-
-  j = nlohmann::json{{"instructions", stats.instrs()},
-                     {"cycles", stats.cycles()},
-                     {"Avg ROB occupancy at mispredict", std::ceil(stats.total_rob_occupancy_at_branch_mispredict) / std::ceil(total_mispredictions)},
-                     {"mispredict", mpki}};
-}
-
-void to_json(nlohmann::json& j, const CACHE::stats_type stats)
-{
-  constexpr std::array<std::pair<std::string_view, std::size_t>, 5> types{
-      {std::pair{"LOAD", champsim::to_underlying(access_type::LOAD)}, std::pair{"RFO", champsim::to_underlying(access_type::RFO)},
-       std::pair{"PREFETCH", champsim::to_underlying(access_type::PREFETCH)}, std::pair{"WRITE", champsim::to_underlying(access_type::WRITE)},
-       std::pair{"TRANSLATION", champsim::to_underlying(access_type::TRANSLATION)}}};
-
-  std::map<std::string, nlohmann::json> statsmap;
-  statsmap.emplace("prefetch requested", stats.pf_requested);
-  statsmap.emplace("prefetch issued", stats.pf_issued);
-  statsmap.emplace("useful prefetch", stats.pf_useful);
-  statsmap.emplace("useless prefetch", stats.pf_useless);
-  statsmap.emplace("miss latency", stats.avg_miss_latency);
-  for (const auto& type : types) {
-    statsmap.emplace(type.first, nlohmann::json{{"hit", stats.hits[type.second]}, {"miss", stats.misses[type.second]}});
-  }
-
-  j = statsmap;
-}
-
-void to_json(nlohmann::json& j, const DRAM_CHANNEL::stats_type stats)
-{
-  j = nlohmann::json{{"RQ ROW_BUFFER_HIT", stats.RQ_ROW_BUFFER_HIT},
-                     {"RQ ROW_BUFFER_MISS", stats.RQ_ROW_BUFFER_MISS},
-                     {"WQ ROW_BUFFER_HIT", stats.WQ_ROW_BUFFER_HIT},
-                     {"WQ ROW_BUFFER_MISS", stats.WQ_ROW_BUFFER_MISS},
-                     {"AVG DBUS CONGESTED CYCLE", std::ceil(stats.dbus_cycle_congested) / std::ceil(stats.dbus_count_congested)}};
-}
-
-namespace champsim
-{
-void to_json(nlohmann::json& j, const champsim::phase_stats stats)
-{
-  std::map<std::string, nlohmann::json> roi_stats;
-  roi_stats.emplace("cores", stats.roi_cpu_stats);
-  roi_stats.emplace("DRAM", stats.roi_dram_stats);
-  for (auto x : stats.roi_cache_stats)
-    roi_stats.emplace(x.name, x);
-
-  std::map<std::string, nlohmann::json> sim_stats;
-  sim_stats.emplace("cores", stats.sim_cpu_stats);
-  sim_stats.emplace("DRAM", stats.sim_dram_stats);
-  for (auto x : stats.sim_cache_stats)
-    sim_stats.emplace(x.name, x);
-
-  std::map<std::string, nlohmann::json> statsmap{{"name", stats.name}, {"traces", stats.trace_names}};
-  statsmap.emplace("roi", roi_stats);
-  statsmap.emplace("sim", sim_stats);
-  j = statsmap;
-}
-} // namespace champsim
-
-void champsim::json_printer::print(std::vector<phase_stats>& stats) { stream << nlohmann::json::array_t{std::begin(stats), std::end(stats)}; }
+ #include <cmath>
+ #include <numeric>
+
+ #include "stats_printer.h"
+ #include <nlohmann/json.hpp>
+
+ void to_json(nlohmann::json& j, const O3_CPU::stats_type stats)
+ {
+   std::array<std::pair<std::string, std::size_t>, 6> types{
+       {std::pair{"BRANCH_DIRECT_JUMP", BRANCH_DIRECT_JUMP}, std::pair{"BRANCH_INDIRECT", BRANCH_INDIRECT}, std::pair{"BRANCH_CONDITIONAL", BRANCH_CONDITIONAL},
+        std::pair{"BRANCH_DIRECT_CALL", BRANCH_DIRECT_CALL}, std::pair{"BRANCH_INDIRECT_CALL", BRANCH_INDIRECT_CALL},
+        std::pair{"BRANCH_RETURN", BRANCH_RETURN}}};
+
+   auto total_mispredictions = std::ceil(
+       std::accumulate(std::begin(types), std::end(types), 0ll, [btm = stats.branch_type_misses](auto acc, auto next) { return acc + btm[next.second]; }));
+
+   std::map<std::string, std::size_t> mpki{};
+   for (auto [name, idx] : types)
+     mpki.emplace(name, stats.branch_type_misses[idx]);
+
+   auto total_branch = std::ceil(
+       std::accumulate(std::begin(types), std::end(types), 0ll, [tbt = stats.total_branch_types](auto acc, auto next) { return acc + tbt[next.second]; }));
+
+   j = nlohmann::json{{"instructions", stats.instrs()},
+                      {"cycles", stats.cycles()},
+                      {"Avg ROB occupancy at mispredict", std::ceil(stats.total_rob_occupancy_at_branch_mispredict) / std::ceil(total_mispredictions)},
+                      {"mispredict", mpki},
+                      {"Branch Prediction Accuracy", (100.0 * std::ceil(total_branch - total_mispredictions)) / total_branch},
+                      {"MPKI", (1000.0 * total_mispredictions) / std::ceil(stats.instrs())}};
+ }
+
+ void to_json(nlohmann::json& j, const CACHE::stats_type stats)
+ {
+   constexpr std::array<std::pair<std::string_view, std::size_t>, 5> types{
+       {std::pair{"LOAD", champsim::to_underlying(access_type::LOAD)}, std::pair{"RFO", champsim::to_underlying(access_type::RFO)},
+        std::pair{"PREFETCH", champsim::to_underlying(access_type::PREFETCH)}, std::pair{"WRITE", champsim::to_underlying(access_type::WRITE)},
+        std::pair{"TRANSLATION", champsim::to_underlying(access_type::TRANSLATION)}}};
+
+   std::map<std::string, nlohmann::json> statsmap;
+   statsmap.emplace("prefetch requested", stats.pf_requested);
+   statsmap.emplace("prefetch issued", stats.pf_issued);
+   statsmap.emplace("useful prefetch", stats.pf_useful);
+   statsmap.emplace("useless prefetch", stats.pf_useless);
+   statsmap.emplace("miss latency", stats.avg_miss_latency);
+   for (const auto& type : types) {
+     statsmap.emplace(type.first, nlohmann::json{{"hit", stats.hits[type.second]}, {"miss", stats.misses[type.second]}});
+   }
+
+   j = statsmap;
+ }
+
+ void to_json(nlohmann::json& j, const DRAM_CHANNEL::stats_type stats)
+ {
+   j = nlohmann::json{{"RQ ROW_BUFFER_HIT", stats.RQ_ROW_BUFFER_HIT},
+                      {"RQ ROW_BUFFER_MISS", stats.RQ_ROW_BUFFER_MISS},
+                      {"WQ ROW_BUFFER_HIT", stats.WQ_ROW_BUFFER_HIT},
+                      {"WQ ROW_BUFFER_MISS", stats.WQ_ROW_BUFFER_MISS},
+                      {"AVG DBUS CONGESTED CYCLE", std::ceil(stats.dbus_cycle_congested) / std::ceil(stats.dbus_count_congested)}};
+ }
+
+ namespace champsim
+ {
+ void to_json(nlohmann::json& j, const champsim::phase_stats stats)
+ {
+   std::map<std::string, nlohmann::json> roi_stats;
+   // roi_stats.emplace("cores", stats.roi_cpu_stats);
+   // roi_stats.emplace("DRAM", stats.roi_dram_stats);
+   // for (auto x : stats.roi_cache_stats)
+   //   roi_stats.emplace(x.name, x);
+
+   std::map<std::string, nlohmann::json> sim_stats;
+   sim_stats.emplace("cores", stats.sim_cpu_stats);
+   // sim_stats.emplace("DRAM", stats.sim_dram_stats);
+   for (auto x : stats.sim_cache_stats)
+     sim_stats.emplace(x.name, x);
+
+   std::map<std::string, nlohmann::json> statsmap{{"name", stats.name}, {"traces", stats.trace_names}};
+   // statsmap.emplace("roi", roi_stats);
+   statsmap.emplace("sim", sim_stats);
+   j = statsmap;
+ }
+ } // namespace champsim
+
+ void champsim::json_printer::print(std::vector<phase_stats>& stats) { stream << nlohmann::json::array_t{std::begin(stats), std::end(stats)}; }
\ No newline at end of file
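The repeated std::ceil calls in these statistics serve only to coerce integer counters to double so the divisions don't truncate; a plain static_cast states that intent more directly. A minimal sketch with made-up counts, showing the same accuracy and MPKI arithmetic as the JSON fields above:

#include <cstdint>
#include <cstdio>

int main()
{
  // Hypothetical counters standing in for the accumulated stats fields
  std::int64_t total_branch = 180000, total_mispredictions = 9000, instructions = 1000000;

  // Equivalent of "Branch Prediction Accuracy" and "MPKI", without std::ceil
  double accuracy = 100.0 * static_cast<double>(total_branch - total_mispredictions) / static_cast<double>(total_branch);
  double mpki = 1000.0 * static_cast<double>(total_mispredictions) / static_cast<double>(instructions);

  std::printf("accuracy: %.2f%%  MPKI: %.2f\n", accuracy, mpki); // accuracy: 95.00%  MPKI: 9.00
}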
diff --git a/src/ooo_cpu.cc b/src/ooo_cpu.cc
index 04493c3409..03d94dc2ca 100644
--- a/src/ooo_cpu.cc
+++ b/src/ooo_cpu.cc
@@ -51,23 +51,31 @@ long O3_CPU::operate()
   progress += check_dib();
   initialize_instruction();
 
-  // heartbeat
-  if (show_heartbeat && (num_retired >= next_print_instruction)) {
-    auto heartbeat_instr{std::ceil(num_retired - last_heartbeat_instr)};
-    auto heartbeat_cycle{std::ceil(current_cycle - last_heartbeat_cycle)};
+// heartbeat
+if (show_heartbeat && (num_retired >= next_print_instruction)) {
+  auto heartbeat_instr{std::ceil(num_retired - last_heartbeat_instr)};
+  auto heartbeat_cycle{std::ceil(current_cycle - last_heartbeat_cycle)};
 
-    auto phase_instr{std::ceil(num_retired - begin_phase_instr)};
-    auto phase_cycle{std::ceil(current_cycle - begin_phase_cycle)};
+  auto phase_instr{std::ceil(num_retired - begin_phase_instr)};
+  auto phase_cycle{std::ceil(current_cycle - begin_phase_cycle)};
+
+  constexpr std::array<std::pair<std::string_view, std::size_t>, 6> types{
+      {std::pair{"BRANCH_DIRECT_JUMP", BRANCH_DIRECT_JUMP}, std::pair{"BRANCH_INDIRECT", BRANCH_INDIRECT}, std::pair{"BRANCH_CONDITIONAL", BRANCH_CONDITIONAL},
+       std::pair{"BRANCH_DIRECT_CALL", BRANCH_DIRECT_CALL}, std::pair{"BRANCH_INDIRECT_CALL", BRANCH_INDIRECT_CALL},
+       std::pair{"BRANCH_RETURN", BRANCH_RETURN}}};
 
-    fmt::print("Heartbeat CPU {} instructions: {} cycles: {} heartbeat IPC: {:.4g} cumulative IPC: {:.4g} (Simulation time: {:%H hr %M min %S sec})\n", cpu,
-               num_retired, current_cycle, heartbeat_instr / heartbeat_cycle, phase_instr / phase_cycle, elapsed_time());
-    next_print_instruction += STAT_PRINTING_PERIOD;
+  auto total_branch = std::ceil(
+      std::accumulate(std::begin(types), std::end(types), 0ll, [tbt = sim_stats.total_branch_types](auto acc, auto next) { return acc + tbt[next.second]; }));
+  auto total_mispredictions = std::ceil(
+      std::accumulate(std::begin(types), std::end(types), 0ll, [btm = sim_stats.branch_type_misses](auto acc, auto next) { return acc + btm[next.second]; }));
 
-    last_heartbeat_instr = num_retired;
-    last_heartbeat_cycle = current_cycle;
-  }
-  return progress;
+  fmt::print("Heartbeat CPU {} instructions: {} cycles: {} heartbeat IPC: {:.4g} cumulative IPC: {:.4g} Prediction_Accuracy: {:.4g}% (Simulation time: {:%H hr %M min %S sec})\n",
+             cpu, num_retired, current_cycle, heartbeat_instr / heartbeat_cycle, phase_instr / phase_cycle,
+             (100.0 * std::ceil(total_branch - total_mispredictions)) / total_branch, elapsed_time());
+  next_print_instruction += STAT_PRINTING_PERIOD;
+
+  last_heartbeat_instr = num_retired;
+  last_heartbeat_cycle = current_cycle;
+}
+
+return progress;
 }
 
 void O3_CPU::initialize()
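One caveat with heartbeat_frequency lowered to 100: the first heartbeats can fire before any branch has retired, so total_branch is 0 and the printed accuracy becomes NaN or inf. A hypothetical guard in the same fmt style, shown here as a self-contained sketch rather than a patch to operate():

#include <fmt/core.h>

int main()
{
  // Stand-ins for the accumulated heartbeat counters; first heartbeat, no branches yet
  double total_branch = 0, total_mispredictions = 0;

  // Only report accuracy once at least one branch has been counted
  if (total_branch > 0)
    fmt::print("Prediction_Accuracy: {:.4g}%\n", 100.0 * (total_branch - total_mispredictions) / total_branch);
  else
    fmt::print("Prediction_Accuracy: n/a\n");
}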