diff --git a/Makefile b/Makefile
index 74b40105d1..94675c3f4a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,19 +1,29 @@
 ROOT_DIR = $(patsubst %/,%,$(dir $(abspath $(firstword $(MAKEFILE_LIST)))))
 CPPFLAGS += -MMD -I$(ROOT_DIR)/inc
-CXXFLAGS += --std=c++17 -O3 -Wall -Wextra -Wshadow -Wpedantic
+CXXFLAGS += --std=c++17 -Ofast -Wno-shadow -Wpedantic -fopenmp -Wno-variadic-macros
+
+# CUDA flags
+CUDA_FLAGS = -arch=all-major -O3 -ltoir -gen-opt-lto --use_fast_math --cudadevrt static --prec-div=false --extra-device-vectorization --default-stream per-thread
+
+# Compiler and linker
+CXX = g++
+NVCC = nvcc
 
 # vcpkg integration
 TRIPLET_DIR = $(patsubst %/,%,$(firstword $(filter-out $(ROOT_DIR)/vcpkg_installed/vcpkg/, $(wildcard $(ROOT_DIR)/vcpkg_installed/*/))))
 CPPFLAGS += -isystem $(TRIPLET_DIR)/include
-LDFLAGS += -L$(TRIPLET_DIR)/lib -L$(TRIPLET_DIR)/lib/manual-link
-LDLIBS += -llzma -lz -lbz2 -lfmt
+LDLIBS += -L$(TRIPLET_DIR)/lib -L$(TRIPLET_DIR)/lib/manual-link -L/usr/local/cuda-12.6/lib64
+LDLIBS += -llzma -lz -lbz2 -lfmt -lpthread -lcudart -lcublas -lgomp
+
+TORCH_DIR = /home/john/libtorch/libtorch
+CPPFLAGS += -I$(TORCH_DIR)/include -I$(TORCH_DIR)/include/torch/csrc/api/include
+LDLIBS += -L$(TORCH_DIR)/lib -lc10 -ltorch -ltorch_cpu -lc
 
-.phony: all all_execs clean configclean test makedirs
+.phony: all all_execs clean configclean test makedirs cuda.o
 
 test_main_name=$(ROOT_DIR)/test/bin/000-test-main
 
-all: all_execs
+all: cuda.o all_execs
 
 # Generated configuration makefile contains:
 # - $(executable_name), the list of all executables in the configuration
@@ -33,6 +43,7 @@ clean:
 	@-$(RM) inc/cache_modules.h
 	@-$(RM) inc/ooo_cpu_modules.h
 	@-$(RM) src/core_inst.cc
+	@-$(RM) cuda.o
 	@-$(RM) $(test_main_name)
 
 # Remove all configuration files
@@ -48,18 +59,18 @@ $(filter-out test, $(sort $(build_dirs) $(module_dirs))): | $(dir $@)
 $(build_objs) $(module_objs):
 	$(COMPILE.cc) $(OUTPUT_OPTION) $<
 
+cuda.o:
+	$(NVCC) -c $(ROOT_DIR)/inc/cuda.cu -o $@ $(CUDA_FLAGS)
+
 # Add address sanitizers for tests
 #$(test_main_name): CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer
 $(test_main_name): CXXFLAGS += -g3 -Og -Wconversion
 $(test_main_name): LDLIBS += -lCatch2Main -lCatch2
-# Link test executable
-$(test_main_name):
-	$(LINK.cc) $(LDFLAGS) -o $@ $(filter-out %/main.o, $^) $(LOADLIBES) $(LDLIBS)
 
 # Link main executables
 $(filter-out $(test_main_name), $(executable_name)):
-	$(LINK.cc) $(LDFLAGS) -o $@ $^ $(LOADLIBES) $(LDLIBS)
+	$(CXX) $^ cuda.o $(LDLIBS) -o $@
 
 # Tests: build and run
 test: $(test_main_name)
@@ -68,4 +79,4 @@ test: $(test_main_name)
 pytest:
 	PYTHONPATH=$(PYTHONPATH):$(shell pwd) python3 -m unittest discover -v --start-directory='test/python'
 
--include $(foreach dir,$(wildcard .csconfig/*/) $(wildcard .csconfig/test/*/),$(wildcard $(dir)/obj/*.d))
+-include $(foreach dir,$(wildcard .csconfig/*/) $(wildcard .csconfig/test/*/),$(wildcard $(dir)/obj/*.d))
\ No newline at end of file
diff --git a/branch/False/False.cc b/branch/False/False.cc
new file mode 100644
index 0000000000..98a06eb620
--- /dev/null
+++ b/branch/False/False.cc
@@ -0,0 +1,9 @@
+#include "ooo_cpu.h"
+
+void O3_CPU::initialize_branch_predictor() {}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip){
+  return 0;
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type){}
diff --git a/branch/Linear_NN/Linear_NN.cc b/branch/Linear_NN/Linear_NN.cc
new file mode 100644
index 0000000000..eed6584b43
--- /dev/null
+++ b/branch/Linear_NN/Linear_NN.cc
@@ -0,0 +1,89 @@
+#include <torch/torch.h>
+#include <bitset>
+#include <iostream>
+#include "ooo_cpu.h"
+
+constexpr std::size_t HISTORY_LENGTH = 64;
+
+struct Net : torch::nn::Module {
+  Net()
+      : linear1(register_module("linear1", torch::nn::Linear(1 + HISTORY_LENGTH, 16))), // Input: IP + History
+        linear2(register_module("linear2", torch::nn::Linear(16, 1))) {
+    another_bias = register_parameter("b", torch::randn(1));
+  }
+
+  torch::Tensor forward(torch::Tensor input) {
+    input = torch::relu(linear1(input));                  // Apply ReLU activation
+    return torch::sigmoid(linear2(input) + another_bias); // Sigmoid to output probability
+  }
+
+  torch::nn::Linear linear1, linear2;
+  torch::Tensor another_bias;
+};
+
+std::bitset<HISTORY_LENGTH> Global_History;
+Net net;
+
+void O3_CPU::initialize_branch_predictor() {}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip)
+{
+  // Convert history into a feature vector
+  std::vector<float> history_features;
+  for (size_t i = 0; i < HISTORY_LENGTH; ++i) {
+    history_features.push_back(Global_History[i] ? 1.0f : 0.0f); // Convert bitset to float
+  }
+
+  // Normalize IP
+  float norm_ip = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+  history_features.insert(history_features.begin(), norm_ip); // Insert normalized IP
+
+  // Convert to Tensor
+  torch::Tensor input = torch::tensor(history_features).view({1, HISTORY_LENGTH + 1});
+
+  // Forward pass through neural network
+  torch::Tensor output = net.forward(input);
+  float prediction = output.item<float>();
+
+  std::cout << "Prediction: " << prediction << std::endl;
+  return prediction > 0.5 ? 1 : 0;
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type)
+{
+  // Convert history into a feature vector
+  std::vector<float> history_features;
+  for (size_t i = 0; i < HISTORY_LENGTH; ++i) {
+    history_features.push_back(Global_History[i] ? 1.0f : 0.0f);
+  }
+
+  // Normalize IP
+  float norm_ip = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+  history_features.insert(history_features.begin(), norm_ip);
+
+  // Convert to Tensor
+  torch::Tensor input = torch::tensor(history_features).view({1, HISTORY_LENGTH + 1});
+
+  std::cout << "Taken: " << (int)taken << std::endl;
+
+  // Convert expected output
+  torch::Tensor target = torch::tensor(static_cast<float>(taken), torch::dtype(torch::kFloat32)).view({1, 1});
+
+  // Define optimizer (Adam for adaptive learning)
+  static torch::optim::Adam optimizer(net.parameters(), torch::optim::AdamOptions(0.001));
+
+  // Forward pass
+  torch::Tensor prediction = net.forward(input);
+
+  // Compute loss
+  torch::Tensor loss = torch::binary_cross_entropy(prediction, target);
+
+  // Backpropagation
+  optimizer.zero_grad();
+  loss.backward();
+  optimizer.step();
+
+  // Update global history
+  Global_History >>= 1;
+  Global_History[HISTORY_LENGTH - 1] = taken;
+}
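Note on the predictor above: every retired branch triggers one forward pass, one BCE loss, and one Adam step against the observed outcome. The following is a minimal standalone sketch of that online-training loop, assuming only libtorch; `main`, `ip`, and `taken` are illustrative stand-ins, and the `static` optimizer inside the hook is replaced by one constructed up front.

#include <torch/torch.h>
#include <bitset>
#include <vector>

int main()
{
  constexpr int64_t HIST = 64;
  std::bitset<HIST> history; // global branch history, oldest bit shifted out each update

  // Same shape as Net above: (1 + HIST) features -> 16 -> 1 probability
  auto net = torch::nn::Sequential(torch::nn::Linear(1 + HIST, 16), torch::nn::ReLU(),
                                   torch::nn::Linear(16, 1), torch::nn::Sigmoid());
  torch::optim::Adam opt(net->parameters(), torch::optim::AdamOptions(0.001));

  uint64_t ip = 0x400123; // example branch address
  uint8_t taken = 1;      // example outcome

  // Feature row: normalized IP first, then the 64 history bits as 0.0/1.0
  std::vector<float> feat{static_cast<float>(ip) / static_cast<float>(UINT64_MAX)};
  for (int64_t i = 0; i < HIST; ++i)
    feat.push_back(history[i] ? 1.0f : 0.0f);
  auto input = torch::tensor(feat).view({1, 1 + HIST});

  // One online step: predict, score against the outcome, update the weights
  auto pred = net->forward(input);
  auto target = torch::full({1, 1}, static_cast<float>(taken));
  auto loss = torch::binary_cross_entropy(pred, target);
  opt.zero_grad();
  loss.backward();
  opt.step();

  // Shift the outcome into the history, as last_branch_result does above
  history >>= 1;
  history[HIST - 1] = taken;
}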
diff --git a/branch/Transformer_NN/Transformer_NN.cc b/branch/Transformer_NN/Transformer_NN.cc
new file mode 100644
index 0000000000..ee1a89fae8
--- /dev/null
+++ b/branch/Transformer_NN/Transformer_NN.cc
@@ -0,0 +1,158 @@
+#define TORCH_WARN_OFF
+#include <torch/torch.h>
+#include <iostream>
+#include <fstream>
+#include <mutex>
+#include <bitset>
+#include "ooo_cpu.h"
+
+#define LOCAL_HISTORY_SIZE 8192
+#define HIDDEN_SIZE 16
+#define NUM_LAYERS 1
+#define LEARNING_RATE 0.0001
+#define NUM_HEADS 4
+
+constexpr size_t LOCAL_HISTORY_BITS = 8;
+constexpr size_t GLOBAL_HISTORY_BITS = 64;
+constexpr size_t INPUT_SIZE = LOCAL_HISTORY_BITS + GLOBAL_HISTORY_BITS + 1;
+
+// Bimodal table for local branch history
+std::vector<std::bitset<LOCAL_HISTORY_BITS>> Local_History(LOCAL_HISTORY_SIZE);
+// Global branch history
+std::bitset<GLOBAL_HISTORY_BITS> Global_History;
+
+// Mutex for optimizer updates
+std::mutex optimizer_mutex;
+
+// Log file for async logging
+std::ofstream log_file("debug.log", std::ios::app);
+
+void log_debug(const std::string& msg) {
+  std::lock_guard<std::mutex> lock(optimizer_mutex);
+  log_file << msg << std::endl;
+}
+
+// this printout could be entirely wrong I really have no idea how to calculate this
+void print_model_size() {
+  size_t fc_in_params = INPUT_SIZE * HIDDEN_SIZE + HIDDEN_SIZE;
+  size_t fc_out_params = HIDDEN_SIZE * 2 + 2;
+  size_t transformer_encoder_params = NUM_LAYERS * (4 * (HIDDEN_SIZE * HIDDEN_SIZE / NUM_HEADS) + 2 * HIDDEN_SIZE * HIDDEN_SIZE);
+  size_t transformer_decoder_params = NUM_LAYERS * (4 * (HIDDEN_SIZE * HIDDEN_SIZE / NUM_HEADS) + 2 * HIDDEN_SIZE * HIDDEN_SIZE);
+
+  size_t total_params = fc_in_params + fc_out_params + transformer_encoder_params + transformer_decoder_params;
+  double model_size_kb = (total_params * sizeof(float)) / 1024.0;
+  double local_history_size_kb = (LOCAL_HISTORY_SIZE * LOCAL_HISTORY_BITS) / 1024.0;
+
+  std::cout << "Model size: " << model_size_kb << " KB (" << total_params << " parameters)" << std::endl;
+  std::cout << "Local history size: " << local_history_size_kb << " KB (" << LOCAL_HISTORY_SIZE
+            << " entries, " << LOCAL_HISTORY_BITS << " bits per entry)" << std::endl;
+}
+
+struct TransformerPredictor : torch::nn::Module {
+  TransformerPredictor()
+      : fc_in(register_module("fc_in", torch::nn::Linear(INPUT_SIZE, HIDDEN_SIZE))),
+        transformer_encoder_layer(register_module("encoder_layer",
+            torch::nn::TransformerEncoderLayer(torch::nn::TransformerEncoderLayerOptions(HIDDEN_SIZE, NUM_HEADS)))),
+        transformer_encoder(register_module("encoder",
+            torch::nn::TransformerEncoder(transformer_encoder_layer, NUM_LAYERS))),
+        transformer_decoder_layer(register_module("decoder_layer",
+            torch::nn::TransformerDecoderLayer(torch::nn::TransformerDecoderLayerOptions(HIDDEN_SIZE, NUM_HEADS)))),
+        transformer_decoder(register_module("decoder",
+            torch::nn::TransformerDecoder(transformer_decoder_layer, NUM_LAYERS))),
+        fc_out(register_module("fc_out", torch::nn::Linear(HIDDEN_SIZE, 2))),
+        optimizer(std::make_unique<torch::optim::Adam>(parameters(), torch::optim::AdamOptions(LEARNING_RATE))),
+        update_count(0), forward_count(0) {
+    torch::set_num_threads(1); // Limit PyTorch threading
+  }
+
+  torch::Tensor forward(torch::Tensor input) {
+    input = torch::relu(fc_in(input));
+    auto memory = transformer_encoder(input);
+    input = transformer_decoder(input, memory);
+    forward_count++;
+
+    if (forward_count % 10000 == 0) {
+      log_debug("[DEBUG] Forward count: " + std::to_string(forward_count));
+    }
+
+    return torch::softmax(fc_out(input.mean(1)), 1);
+  }
+
+  torch::nn::Linear fc_in, fc_out;
+  torch::nn::TransformerEncoderLayer transformer_encoder_layer;
+  torch::nn::TransformerEncoder transformer_encoder;
+  torch::nn::TransformerDecoderLayer transformer_decoder_layer;
+  torch::nn::TransformerDecoder transformer_decoder;
+  std::unique_ptr<torch::optim::Adam> optimizer;
+  int update_count;
+  int forward_count;
+};
+
+TransformerPredictor transformer_net;
+
+void O3_CPU::initialize_branch_predictor() {
+  print_model_size();
+}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip) {
+  size_t index = ip % LOCAL_HISTORY_SIZE;
+  std::bitset<LOCAL_HISTORY_BITS>& local_history = Local_History[index];
+
+  // XOR global history with the IP
+  uint64_t transformed_global_history = Global_History.to_ullong() ^ ip;
+
+  std::array<float, INPUT_SIZE> features;
+  features[0] = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+
+  for (size_t i = 0; i < LOCAL_HISTORY_BITS; ++i)
+    features[i + 1] = local_history[i] ? 1.0f : 0.0f;
+
+  for (size_t i = 0; i < GLOBAL_HISTORY_BITS; ++i)
+    features[LOCAL_HISTORY_BITS + 1 + i] = (transformed_global_history >> i) & 1 ? 1.0f : 0.0f;
+
+  // Convert std::array to std::vector before creating tensor
+  torch::Tensor input = torch::tensor(std::vector<float>(features.begin(), features.end()),
+                                      torch::dtype(torch::kFloat32)).view({1, 1, INPUT_SIZE});
+
+  return transformer_net.forward(input).argmax(1).item<int>();
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type) {
+  size_t index = ip % LOCAL_HISTORY_SIZE;
+  std::bitset<LOCAL_HISTORY_BITS>& local_history = Local_History[index];
+
+  // XOR global history with the IP
+  uint64_t transformed_global_history = Global_History.to_ullong() ^ ip;
+
+  std::array<float, INPUT_SIZE> features;
+  features[0] = static_cast<float>(ip) / static_cast<float>(UINT64_MAX);
+
+  for (size_t i = 0; i < LOCAL_HISTORY_BITS; ++i)
+    features[i + 1] = local_history[i] ? 1.0f : 0.0f;
+
+  for (size_t i = 0; i < GLOBAL_HISTORY_BITS; ++i)
+    features[LOCAL_HISTORY_BITS + 1 + i] = (transformed_global_history >> i) & 1 ? 1.0f : 0.0f;
+
+  // Convert std::array to std::vector before creating tensor
+  torch::Tensor input = torch::tensor(std::vector<float>(features.begin(), features.end()),
+                                      torch::dtype(torch::kFloat32)).view({1, 1, INPUT_SIZE});
+
+  torch::Tensor target = torch::tensor({1.0f - static_cast<float>(taken), static_cast<float>(taken)}).view({1, 2});
+  torch::Tensor prediction = transformer_net.forward(input);
+  torch::Tensor loss = torch::binary_cross_entropy(prediction, target);
+
+  // Mutex to protect optimizer in parallel tests
+  {
+    std::lock_guard<std::mutex> lock(optimizer_mutex);
+    transformer_net.optimizer->zero_grad();
+    loss.backward();
+    transformer_net.optimizer->step();
+  }
+
+  // Update histories
+  local_history >>= 1;
+  local_history[LOCAL_HISTORY_BITS - 1] = taken;
+  Global_History >>= 1;
+  Global_History[GLOBAL_HISTORY_BITS - 1] = taken;
+}
\ No newline at end of file
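The hand-computed sizes in print_model_size (flagged by the author's own comment) can be cross-checked against libtorch itself, which already knows every registered tensor. A small sketch, assuming libtorch; `probe` is an illustrative stand-in for any module, including the TransformerPredictor above:

#include <torch/torch.h>
#include <cstdio>

// Count trainable scalars in any module (recurses into registered submodules).
size_t count_parameters(const torch::nn::Module& m)
{
  size_t total = 0;
  for (const auto& p : m.parameters())
    total += p.numel(); // number of scalar weights in this tensor
  return total;
}

int main()
{
  // Same shape as fc_in above (INPUT_SIZE = 73): 73*16 weights + 16 biases = 1184
  torch::nn::Linear probe(73, 16);
  size_t n = count_parameters(*probe);
  std::printf("%zu parameters, %.2f KB as float32\n", n, n * sizeof(float) / 1024.0);
}

Calling count_parameters(transformer_net) inside initialize_branch_predictor would give the exact figure that print_model_size only estimates.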
diff --git a/branch/True/True.cc b/branch/True/True.cc
new file mode 100644
index 0000000000..3409790ac9
--- /dev/null
+++ b/branch/True/True.cc
@@ -0,0 +1,9 @@
+#include "ooo_cpu.h"
+
+void O3_CPU::initialize_branch_predictor() {}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip){
+  return 1;
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type){}
diff --git a/branch/tage4/tage4.cc b/branch/tage4/tage4.cc
new file mode 100644
index 0000000000..647d787430
--- /dev/null
+++ b/branch/tage4/tage4.cc
@@ -0,0 +1,346 @@
+
+#include <array>
+#include <bitset>
+#include <cmath>
+#include <cstdlib>
+#include <ctime>
+#include <iostream>
+
+#include "msl/fwcounter.h"
+#include "ooo_cpu.h"
+
+#define Tag uint16_t
+#define Index uint16_t
+#define Path uint64_t
+#define History uint64_t
+#define BIMODAL_TABLE_SIZE 8192
+#define MAX_INDEX_BITS 12
+#define TAGE_TABLES 4
+#define TAGE_TAG_BITS 7
+#define TAGE_CONTER_BITS 3
+#define TAGE_USEFUL_BITS 2
+#define GLOBAL_HISTORY_LENGTH 1024
+#define PATH_HISTORY_BUFFER_LENGTH 32
+#define TAGE_MIN_LENGTH 5
+#define HISTORY_ALPHA 1.5
+#define TAGE_RESET_INTERVAL 512000
+#define BIMODE_COUNTER_BITS 3
+
+int debug_counter = 0;
+
+struct TAGs {
+  Tag tag;
+  uint8_t useful;
+  uint8_t counter;
+};
+
+class Tage {
+private:
+  int count;
+  std::bitset<GLOBAL_HISTORY_LENGTH> GLOBAL_HISTORY;
+  std::bitset<PATH_HISTORY_BUFFER_LENGTH> PATH_HISTORY;
+  uint8_t T0[BIMODAL_TABLE_SIZE];
+  std::array<TAGs, (1 << MAX_INDEX_BITS)> T[TAGE_TABLES];
+  int table_history_lengths[TAGE_TABLES];
+  uint8_t use_alt_on_na;
+  uint8_t tage_pred, pred, alt_pred;
+  int pred_comp, alt_comp; // Provider and alternate component of last branch PC
+  int STRONG;
+  int debug_ct[7] = {0, 0, 0, 0, 0, 0, 0};
+
+public:
+  void init();                             // initialise the member variables
+  uint8_t predict(uint64_t ip);            // return the prediction from tage
+  void update(uint64_t ip, uint8_t taken); // updates the state of tage
+
+  Index get_bimodal_index(uint64_t ip);              // helper hash function to index into the bimodal table
+  Index get_predictor_index(uint64_t ip, int table); // helper hash function to index into the predictor table using histories
+  Tag get_tag(uint64_t ip, int table);               // helper hash function to get the tag of a particular ip and table
+  int get_match_below_n(uint64_t ip, int table);     // helper function to find the hit table strictly before the table argument
+  void ctr_update(uint8_t& ctr, int cond, int low, int high); // counter update helper function (including clipping)
+  uint8_t get_prediction(uint64_t ip, int comp);     // helper function for prediction
+  Path get_path_history_hash(int table);             // helper hash function to compress the path history
+  History get_compressed_global_history(int inSize, int outSize); // Compress global history of last 'inSize' branches into 'outSize' by wrapping the history
+};
+
+uint8_t Tage::predict(uint64_t ip)
+{
+  pred_comp = get_match_below_n(ip, TAGE_TABLES + 1); // Get the first predictor from the end which matches the PC
+  alt_comp = get_match_below_n(ip, pred_comp);        // Get the first predictor below the provider which matches the PC
+
+  pred = get_prediction(ip, pred_comp);
+  alt_pred = get_prediction(ip, alt_comp);
+
+  // std::cout << debug_ct[0] << "|" << debug_ct[1] << "|" << debug_ct[2] << "|" << debug_ct[3] << "|" << debug_ct[4] << "|" << debug_ct[5] << std::endl;
+  debug_ct[pred_comp]++;
+  if (pred_comp == 0) { // if there is no alternate predictor we use the default bimodal table
+    tage_pred = pred;
+    // debug_ct[0]++;
+  } else { // if there is an alternate predictor
+    Index index = get_predictor_index(ip, pred_comp);
+    STRONG = abs(2 * T[pred_comp - 1][index].counter + 1 - (1 << TAGE_CONTER_BITS)) > 1; // check to see if the current predictor's guess is strong
+    // std::cout << "use_alt_on_na" << int(use_alt_on_na) << " | " << "STRONG:" << STRONG << std::endl;
+    if (use_alt_on_na < 8 || STRONG) {
+      tage_pred = pred; // if the prediction is strong, use that predictor
+      // debug_ct[1]++;
+    } else {
+      tage_pred = alt_pred; // if the prediction is not strong, use an alternate predictor
+    }
+  }
+  return tage_pred;
+}
+
+void Tage::ctr_update(uint8_t& ctr, int cond, int low, int high)
+{
+  if (cond && ctr < high)
+    ctr++;
+  else if (!cond && ctr > low)
+    ctr--;
+}
+
+void Tage::update(uint64_t ip, uint8_t taken)
+{
+  if (pred_comp > 0) {
+    struct TAGs* entry = &T[pred_comp - 1][get_predictor_index(ip, pred_comp)];
+    uint8_t useful = entry->useful;
+    if (!STRONG) {
+      if (pred != alt_pred)
+        // std::cout << "ENTRY UPDATED" << std::endl;
+        ctr_update(use_alt_on_na, !(pred == taken), 0, 15);
+    }
+
+    if (alt_comp > 0) {
+      struct TAGs* alt_entry = &T[alt_comp - 1][get_predictor_index(ip, alt_comp)];
+      if (useful == 0)
+        // std::cout << "ENTRY UPDATED" << std::endl;
+        ctr_update(alt_entry->counter, taken, 0, ((1 << TAGE_CONTER_BITS) - 1));
+    } else {
+      Index index = get_bimodal_index(ip);
+      if (useful == 0)
+        ctr_update(T0[index], taken, 0, ((1 << BIMODE_COUNTER_BITS) - 1));
+    }
+
+    if (pred != alt_pred) {
+      if (pred == taken) {
+        if (entry->useful < ((1 << TAGE_USEFUL_BITS) - 1))
+          entry->useful++;
+      } else {
+        if (use_alt_on_na < 8) {
+          if (entry->useful > 0)
+            entry->useful--;
+        }
+      }
+    }
+    ctr_update(entry->counter, taken, 0, ((1 << TAGE_CONTER_BITS) - 1));
+  } else {
+    Index index = get_bimodal_index(ip);
+    ctr_update(T0[index], taken, 0, ((1 << BIMODE_COUNTER_BITS) - 1));
+  }
+
+  // On a misprediction, try to allocate an entry in a longer-history table
+  if (tage_pred != taken && pred_comp < TAGE_TABLES) {
+    int random = static_cast<int>(rand()) / static_cast<int>(RAND_MAX);
+    random = random & ((1 << (TAGE_TABLES - pred_comp - 1)) - 1);
+    int start_component = pred_comp + 1;
+
+    if (random & 1) {
+      start_component++;
+      if (random & 2)
+        start_component++;
+    }
+    int isFree = 0;
+    for (int i = pred_comp + 1; i <= TAGE_TABLES; i++) {
+      struct TAGs* entry_new = &T[i - 1][get_predictor_index(ip, i)];
+      if (entry_new->useful == 0)
+        isFree = 1;
+    }
+    if (!isFree && start_component <= TAGE_TABLES)
+      T[start_component - 1][get_predictor_index(ip, start_component)].useful = 0;
+
+    for (int i = start_component; i <= TAGE_TABLES; i++) {
+      struct TAGs* entry_new = &T[i - 1][get_predictor_index(ip, i)];
+      if (entry_new->useful == 0) {
+        entry_new->tag = get_tag(ip, i);
+        entry_new->counter = (1 << (TAGE_CONTER_BITS - 1));
+        break;
+      }
+    }
+  }
+
+  for (int i = GLOBAL_HISTORY_LENGTH - 1; i > 0; i--)
+    GLOBAL_HISTORY[i] = GLOBAL_HISTORY[i - 1];
+  GLOBAL_HISTORY[0] = taken;
+
+  for (int i = PATH_HISTORY_BUFFER_LENGTH - 1; i > 0; i--)
+    PATH_HISTORY[i] = PATH_HISTORY[i - 1];
+  PATH_HISTORY[0] = ip & 1;
+
+  count++;
+  if (count % TAGE_RESET_INTERVAL == 0) {
+    count = 0;
+    for (int i = 0; i < TAGE_TABLES; i++) {
+      for (int j = 0; j < (1 << MAX_INDEX_BITS); j++)
+        T[i][j].useful >>= 1;
+    }
+  }
+}
+
+uint8_t Tage::get_prediction(uint64_t ip, int comp)
+{
+  if (comp == 0) {
+    Index index = get_bimodal_index(ip);
+    // std::cout << "T0[index] = " << int(T0[index]) << std::endl;
+    return (T0[index] >= (1 << (TAGE_CONTER_BITS - 1)));
+  } else {
+    Index index = get_predictor_index(ip, comp);
+    // std::cout << "T[comp-1][index] = " << T[comp-1][index].counter << std::endl;
+    return (T[comp - 1][index].counter >= (1 << (TAGE_CONTER_BITS - 1)));
+  }
+}
+
+Index Tage::get_bimodal_index(uint64_t ip)
+{
+  return ip & (BIMODAL_TABLE_SIZE - 1);
+}
+
+Tag Tage::get_tag(uint64_t ip, int table)
+{
+  History global_history_hash = get_compressed_global_history(table_history_lengths[table - 1], TAGE_TAG_BITS);
+  global_history_hash ^= get_compressed_global_history(table_history_lengths[table - 1], TAGE_TAG_BITS - 1);
+  return ((global_history_hash ^ ip) & ((1 << TAGE_TAG_BITS) - 1));
+}
+
+int Tage::get_match_below_n(uint64_t ip, int table)
+{
+  for (int i = table - 1; i >= 1; i--) {
+    Index index = get_predictor_index(ip, i);
+    Tag tag = get_tag(ip, i);
+
+    if (T[i - 1][index].tag == tag) {
+      // std::cout << T[i-1][index].tag << " | " << tag << "|" << i << std::endl;
+      return i;
+    }
+  }
+  return 0;
+}
+
+void Tage::init()
+{
+  srand(time(0));
+  use_alt_on_na = 8;
+  tage_pred = 0;
+  count = 0;
+
+  for (int i = 0; i < BIMODAL_TABLE_SIZE; i++)
+    T0[i] = (1 << (BIMODE_COUNTER_BITS - 1)); // weakly taken
+  for (int i = 0; i < TAGE_TABLES; i++) {
+    for (int j = 0; j < (1 << MAX_INDEX_BITS); j++) {
+      T[i][j].counter = (1 << (BIMODE_COUNTER_BITS - 1));
+      T[i][j].useful = 0;
+      T[i][j].tag = 0;
+    }
+  }
+  double power = 1;
+  for (int i = 0; i < TAGE_TABLES; i++) {
+    table_history_lengths[i] = int(TAGE_MIN_LENGTH * power + 0.5);
+    power *= HISTORY_ALPHA;
+    std::cout << "lengths " << table_history_lengths[i] << std::endl;
+  }
+  // for (int i = 0; i < TAGE_TABLES; i++)
+  // {
+  //   for (int j = 0; j < (1 << MAX_INDEX_BITS); j++)
+  //     std::cout << T[i][j].tag << "|" << int(T[i][j].useful) << "|" << int(T[i][j].counter) << std::endl;
+  // }
+  // exit(0);
+}
+
+Path Tage::get_path_history_hash(int table)
+{
+  Path A = 0;
+  Path size = table_history_lengths[table - 1] > 16 ? 16 : table_history_lengths[table - 1];
+  for (int i = PATH_HISTORY_BUFFER_LENGTH - 1; i >= 0; i--)
+    A = (A << 1) | PATH_HISTORY[i];
+  A = A & ((1 << size) - 1);
+
+  Path A1, A2;
+  A1 = A & ((1 << MAX_INDEX_BITS) - 1);
+  A2 = A >> MAX_INDEX_BITS;
+
+  // Use hashing from the CBP-4 L-TAGE submission
+  A2 = ((A2 << table) & ((1 << MAX_INDEX_BITS) - 1)) + (A2 >> abs(MAX_INDEX_BITS - table));
+  A = A1 ^ A2;
+  A = ((A << table) & ((1 << MAX_INDEX_BITS) - 1)) + (A >> abs(MAX_INDEX_BITS - table));
+  return (A);
+}
+
+Index Tage::get_predictor_index(uint64_t ip, int table)
+{
+  Path path_history_hash = get_path_history_hash(table);
+
+  // Hash of global history
+  History global_history_hash = get_compressed_global_history(table_history_lengths[table - 1], MAX_INDEX_BITS);
+
+  // Really complex hashing function
+  return (global_history_hash ^ ip ^ (ip >> (abs(MAX_INDEX_BITS - table) + 1)) ^ path_history_hash) & ((1 << MAX_INDEX_BITS) - 1);
+}
+
+History Tage::get_compressed_global_history(int inSize, int outSize)
+{
+  History compressed_history = 0;
+  History temporary_history = 0;
+  int compressed_history_length = outSize;
+  for (int i = 0; i < inSize; i++) {
+    if (i % compressed_history_length == 0) {
+      compressed_history ^= temporary_history;
+      temporary_history = 0;
+    }
+    temporary_history = (temporary_history << 1) | GLOBAL_HISTORY[i];
+  }
+  compressed_history ^= temporary_history;
+  return compressed_history;
+}
+
+Tage tage_predictor[NUM_CPUS];
+
+void O3_CPU::initialize_branch_predictor()
+{
+  tage_predictor[cpu].init();
+}
+
+uint8_t O3_CPU::predict_branch(uint64_t ip)
+{
+  // std::cout << debug_counter << std::endl;
+  return tage_predictor[cpu].predict(ip);
+}
+
+void O3_CPU::last_branch_result(uint64_t ip, uint64_t branch_target, uint8_t taken, uint8_t branch_type)
+{
+  tage_predictor[cpu].update(ip, taken);
+}
\ No newline at end of file
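Tage::init grows the per-table history lengths geometrically, length_i = round(TAGE_MIN_LENGTH * HISTORY_ALPHA^i), which for the constants above yields 5, 8, 11, 17 bits across the four tagged tables. A standalone check of that series, using the same rounding as init:

#include <cstdio>

// Reproduces the table_history_lengths loop from Tage::init() with
// TAGE_MIN_LENGTH = 5, HISTORY_ALPHA = 1.5, TAGE_TABLES = 4.
// Prints: 5, 8, 11, 17.
int main()
{
  const double alpha = 1.5;
  double power = 1;
  for (int i = 0; i < 4; i++) {
    std::printf("T%d history length: %d\n", i + 1, int(5 * power + 0.5));
    power *= alpha;
  }
}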
diff --git a/champsim_config.json b/champsim_config.json
index 9631d1e5d4..4dbf394dff 100644
--- a/champsim_config.json
+++ b/champsim_config.json
@@ -2,7 +2,7 @@
     "executable_name": "champsim",
     "block_size": 64,
     "page_size": 4096,
-    "heartbeat_frequency": 10000000,
+    "heartbeat_frequency": 100,
     "num_cores": 1,
     "ooo_cpu": [
         {
@@ -26,7 +26,7 @@
             "dispatch_latency": 1,
             "schedule_latency": 0,
             "execute_latency": 0,
-            "branch_predictor": "transformer",
+            "branch_predictor": "lstm_predict",
             "btb": "basic_btb"
         }
     ],
diff --git a/src/json_printer.cc b/src/json_printer.cc
index cb2fdbf8c8..fc8230b6d4 100644
--- a/src/json_printer.cc
+++ b/src/json_printer.cc
@@ -14,82 +14,89 @@
  * limitations under the License.
  */
 
-#include <cmath>
-#include <numeric>
-
-#include "stats_printer.h"
-#include <nlohmann/json.hpp>
-
-void to_json(nlohmann::json& j, const O3_CPU::stats_type stats)
-{
-  std::array<std::pair<std::string, std::size_t>, 6> types{
-      {std::pair{"BRANCH_DIRECT_JUMP", BRANCH_DIRECT_JUMP}, std::pair{"BRANCH_INDIRECT", BRANCH_INDIRECT}, std::pair{"BRANCH_CONDITIONAL", BRANCH_CONDITIONAL},
-       std::pair{"BRANCH_DIRECT_CALL", BRANCH_DIRECT_CALL}, std::pair{"BRANCH_INDIRECT_CALL", BRANCH_INDIRECT_CALL},
-       std::pair{"BRANCH_RETURN", BRANCH_RETURN}}};
-
-  auto total_mispredictions = std::ceil(
-      std::accumulate(std::begin(types), std::end(types), 0ll, [btm = stats.branch_type_misses](auto acc, auto next) { return acc + btm[next.second]; }));
-
-  std::map<std::string, std::size_t> mpki{};
-  for (auto [name, idx] : types)
-    mpki.emplace(name, stats.branch_type_misses[idx]);
-
-  j = nlohmann::json{{"instructions", stats.instrs()},
-                     {"cycles", stats.cycles()},
-                     {"Avg ROB occupancy at mispredict", std::ceil(stats.total_rob_occupancy_at_branch_mispredict) / std::ceil(total_mispredictions)},
-                     {"mispredict", mpki}};
-}
-
-void to_json(nlohmann::json& j, const CACHE::stats_type stats)
-{
-  constexpr std::array<std::pair<std::string_view, std::size_t>, 5> types{
-      {std::pair{"LOAD", champsim::to_underlying(access_type::LOAD)}, std::pair{"RFO", champsim::to_underlying(access_type::RFO)},
-       std::pair{"PREFETCH", champsim::to_underlying(access_type::PREFETCH)}, std::pair{"WRITE", champsim::to_underlying(access_type::WRITE)},
-       std::pair{"TRANSLATION", champsim::to_underlying(access_type::TRANSLATION)}}};
-
-  std::map<std::string, nlohmann::json> statsmap;
-  statsmap.emplace("prefetch requested", stats.pf_requested);
-  statsmap.emplace("prefetch issued", stats.pf_issued);
-  statsmap.emplace("useful prefetch", stats.pf_useful);
-  statsmap.emplace("useless prefetch", stats.pf_useless);
-  statsmap.emplace("miss latency", stats.avg_miss_latency);
-  for (const auto& type : types) {
-    statsmap.emplace(type.first, nlohmann::json{{"hit", stats.hits[type.second]}, {"miss", stats.misses[type.second]}});
-  }
-
-  j = statsmap;
-}
-
-void to_json(nlohmann::json& j, const DRAM_CHANNEL::stats_type stats)
-{
-  j = nlohmann::json{{"RQ ROW_BUFFER_HIT", stats.RQ_ROW_BUFFER_HIT},
-                     {"RQ ROW_BUFFER_MISS", stats.RQ_ROW_BUFFER_MISS},
-                     {"WQ ROW_BUFFER_HIT", stats.WQ_ROW_BUFFER_HIT},
-                     {"WQ ROW_BUFFER_MISS", stats.WQ_ROW_BUFFER_MISS},
-                     {"AVG DBUS CONGESTED CYCLE", std::ceil(stats.dbus_cycle_congested) / std::ceil(stats.dbus_count_congested)}};
-}
-
-namespace champsim
-{
-void to_json(nlohmann::json& j, const champsim::phase_stats stats)
-{
-  std::map<std::string, nlohmann::json> roi_stats;
-  roi_stats.emplace("cores", stats.roi_cpu_stats);
-  roi_stats.emplace("DRAM", stats.roi_dram_stats);
-  for (auto x : stats.roi_cache_stats)
-    roi_stats.emplace(x.name, x);
-
-  std::map<std::string, nlohmann::json> sim_stats;
-  sim_stats.emplace("cores", stats.sim_cpu_stats);
-  sim_stats.emplace("DRAM", stats.sim_dram_stats);
-  for (auto x : stats.sim_cache_stats)
-    sim_stats.emplace(x.name, x);
-
-  std::map<std::string, nlohmann::json> statsmap{{"name", stats.name}, {"traces", stats.trace_names}};
-  statsmap.emplace("roi", roi_stats);
-  statsmap.emplace("sim", sim_stats);
-  j = statsmap;
-}
-} // namespace champsim
-
-void champsim::json_printer::print(std::vector<phase_stats>& stats) { stream << nlohmann::json::array_t{std::begin(stats), std::end(stats)}; }
+ #include <cmath>
+ #include <numeric>
+
+ #include "stats_printer.h"
+ #include <nlohmann/json.hpp>
+
+ void to_json(nlohmann::json& j, const O3_CPU::stats_type stats)
+ {
+   std::array<std::pair<std::string, std::size_t>, 6> types{
+       {std::pair{"BRANCH_DIRECT_JUMP", BRANCH_DIRECT_JUMP}, std::pair{"BRANCH_INDIRECT", BRANCH_INDIRECT}, std::pair{"BRANCH_CONDITIONAL", BRANCH_CONDITIONAL},
+        std::pair{"BRANCH_DIRECT_CALL", BRANCH_DIRECT_CALL}, std::pair{"BRANCH_INDIRECT_CALL", BRANCH_INDIRECT_CALL},
+        std::pair{"BRANCH_RETURN", BRANCH_RETURN}}};
+
+   auto total_mispredictions = std::ceil(
+       std::accumulate(std::begin(types), std::end(types), 0ll, [btm = stats.branch_type_misses](auto acc, auto next) { return acc + btm[next.second]; }));
+
+   std::map<std::string, std::size_t> mpki{};
+   for (auto [name, idx] : types)
+     mpki.emplace(name, stats.branch_type_misses[idx]);
+
+   auto total_branch = std::ceil(
+       std::accumulate(std::begin(types), std::end(types), 0ll, [tbt = stats.total_branch_types](auto acc, auto next) { return acc + tbt[next.second]; }));
+
+   j = nlohmann::json{{"instructions", stats.instrs()},
+                      {"cycles", stats.cycles()},
+                      {"Avg ROB occupancy at mispredict", std::ceil(stats.total_rob_occupancy_at_branch_mispredict) / std::ceil(total_mispredictions)},
+                      {"mispredict", mpki},
+                      {"Branch Prediction Accuracy", (100.0 * std::ceil(total_branch - total_mispredictions)) / total_branch},
+                      {"MPKI", (1000.0 * total_mispredictions) / std::ceil(stats.instrs())}};
+ }
+
+ void to_json(nlohmann::json& j, const CACHE::stats_type stats)
+ {
+   constexpr std::array<std::pair<std::string_view, std::size_t>, 5> types{
+       {std::pair{"LOAD", champsim::to_underlying(access_type::LOAD)}, std::pair{"RFO", champsim::to_underlying(access_type::RFO)},
+        std::pair{"PREFETCH", champsim::to_underlying(access_type::PREFETCH)}, std::pair{"WRITE", champsim::to_underlying(access_type::WRITE)},
+        std::pair{"TRANSLATION", champsim::to_underlying(access_type::TRANSLATION)}}};
+
+   std::map<std::string, nlohmann::json> statsmap;
+   statsmap.emplace("prefetch requested", stats.pf_requested);
+   statsmap.emplace("prefetch issued", stats.pf_issued);
+   statsmap.emplace("useful prefetch", stats.pf_useful);
+   statsmap.emplace("useless prefetch", stats.pf_useless);
+   statsmap.emplace("miss latency", stats.avg_miss_latency);
+   for (const auto& type : types) {
+     statsmap.emplace(type.first, nlohmann::json{{"hit", stats.hits[type.second]}, {"miss", stats.misses[type.second]}});
+   }
+
+   j = statsmap;
+ }
+
+ void to_json(nlohmann::json& j, const DRAM_CHANNEL::stats_type stats)
+ {
+   j = nlohmann::json{{"RQ ROW_BUFFER_HIT", stats.RQ_ROW_BUFFER_HIT},
+                      {"RQ ROW_BUFFER_MISS", stats.RQ_ROW_BUFFER_MISS},
+                      {"WQ ROW_BUFFER_HIT", stats.WQ_ROW_BUFFER_HIT},
+                      {"WQ ROW_BUFFER_MISS", stats.WQ_ROW_BUFFER_MISS},
+                      {"AVG DBUS CONGESTED CYCLE", std::ceil(stats.dbus_cycle_congested) / std::ceil(stats.dbus_count_congested)}};
+ }
+
+ namespace champsim
+ {
+ void to_json(nlohmann::json& j, const champsim::phase_stats stats)
+ {
+   std::map<std::string, nlohmann::json> roi_stats;
+   // roi_stats.emplace("cores", stats.roi_cpu_stats);
+   // roi_stats.emplace("DRAM", stats.roi_dram_stats);
+   // for (auto x : stats.roi_cache_stats)
+   //   roi_stats.emplace(x.name, x);
+
+   std::map<std::string, nlohmann::json> sim_stats;
+   sim_stats.emplace("cores", stats.sim_cpu_stats);
+   // sim_stats.emplace("DRAM", stats.sim_dram_stats);
+   for (auto x : stats.sim_cache_stats)
+     sim_stats.emplace(x.name, x);
+
+   std::map<std::string, nlohmann::json> statsmap{{"name", stats.name}, {"traces", stats.trace_names}};
+   // statsmap.emplace("roi", roi_stats);
+   statsmap.emplace("sim", sim_stats);
+   j = statsmap;
+ }
+ } // namespace champsim
+
+ void champsim::json_printer::print(std::vector<phase_stats>& stats) { stream << nlohmann::json::array_t{std::begin(stats), std::end(stats)}; }
\ No newline at end of file
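The repeated std::ceil calls in these statistics serve only to coerce integer counters to double so the divisions don't truncate; a plain static_cast states that intent more directly. A minimal sketch with made-up counts, showing the same accuracy and MPKI arithmetic as the JSON fields above:

#include <cstdint>
#include <cstdio>

int main()
{
  // Hypothetical counters standing in for the accumulated stats fields
  std::int64_t total_branch = 180000, total_mispredictions = 9000, instructions = 1000000;

  // Equivalent of "Branch Prediction Accuracy" and "MPKI", without std::ceil
  double accuracy = 100.0 * static_cast<double>(total_branch - total_mispredictions) / static_cast<double>(total_branch);
  double mpki = 1000.0 * static_cast<double>(total_mispredictions) / static_cast<double>(instructions);

  std::printf("accuracy: %.2f%%  MPKI: %.2f\n", accuracy, mpki); // accuracy: 95.00%  MPKI: 9.00
}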
diff --git a/src/ooo_cpu.cc b/src/ooo_cpu.cc
index 04493c3409..03d94dc2ca 100644
--- a/src/ooo_cpu.cc
+++ b/src/ooo_cpu.cc
@@ -51,23 +51,31 @@ long O3_CPU::operate()
   progress += check_dib();
   initialize_instruction();
 
-  // heartbeat
-  if (show_heartbeat && (num_retired >= next_print_instruction)) {
-    auto heartbeat_instr{std::ceil(num_retired - last_heartbeat_instr)};
-    auto heartbeat_cycle{std::ceil(current_cycle - last_heartbeat_cycle)};
+// heartbeat
+if (show_heartbeat && (num_retired >= next_print_instruction)) {
+  auto heartbeat_instr{std::ceil(num_retired - last_heartbeat_instr)};
+  auto heartbeat_cycle{std::ceil(current_cycle - last_heartbeat_cycle)};
 
-    auto phase_instr{std::ceil(num_retired - begin_phase_instr)};
-    auto phase_cycle{std::ceil(current_cycle - begin_phase_cycle)};
+  auto phase_instr{std::ceil(num_retired - begin_phase_instr)};
+  auto phase_cycle{std::ceil(current_cycle - begin_phase_cycle)};
+
+  constexpr std::array<std::pair<std::string_view, std::size_t>, 6> types{
+      {std::pair{"BRANCH_DIRECT_JUMP", BRANCH_DIRECT_JUMP}, std::pair{"BRANCH_INDIRECT", BRANCH_INDIRECT}, std::pair{"BRANCH_CONDITIONAL", BRANCH_CONDITIONAL},
+       std::pair{"BRANCH_DIRECT_CALL", BRANCH_DIRECT_CALL}, std::pair{"BRANCH_INDIRECT_CALL", BRANCH_INDIRECT_CALL},
+       std::pair{"BRANCH_RETURN", BRANCH_RETURN}}};
 
-    fmt::print("Heartbeat CPU {} instructions: {} cycles: {} heartbeat IPC: {:.4g} cumulative IPC: {:.4g} (Simulation time: {:%H hr %M min %S sec})\n", cpu,
-               num_retired, current_cycle, heartbeat_instr / heartbeat_cycle, phase_instr / phase_cycle, elapsed_time());
-    next_print_instruction += STAT_PRINTING_PERIOD;
+  auto total_branch = std::ceil(
+      std::accumulate(std::begin(types), std::end(types), 0ll, [tbt = sim_stats.total_branch_types](auto acc, auto next) { return acc + tbt[next.second]; }));
+  auto total_mispredictions = std::ceil(
+      std::accumulate(std::begin(types), std::end(types), 0ll, [btm = sim_stats.branch_type_misses](auto acc, auto next) { return acc + btm[next.second]; }));
 
-    last_heartbeat_instr = num_retired;
-    last_heartbeat_cycle = current_cycle;
-  }
-  return progress;
+  fmt::print("Heartbeat CPU {} instructions: {} cycles: {} heartbeat IPC: {:.4g} cumulative IPC: {:.4g} Prediction_Accuracy: {:.4g}% (Simulation time: {:%H hr %M min %S sec})\n",
+             cpu, num_retired, current_cycle, heartbeat_instr / heartbeat_cycle, phase_instr / phase_cycle,
+             (100.0 * std::ceil(total_branch - total_mispredictions)) / total_branch, elapsed_time());
+  next_print_instruction += STAT_PRINTING_PERIOD;
+
+  last_heartbeat_instr = num_retired;
+  last_heartbeat_cycle = current_cycle;
+}
+
+return progress;
 }
 
 void O3_CPU::initialize()
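One caveat with heartbeat_frequency lowered to 100: the first heartbeats can fire before any branch has retired, so total_branch is 0 and the printed accuracy becomes NaN or inf. A hypothetical guard in the same fmt style, shown here as a self-contained sketch rather than a patch to operate():

#include <fmt/core.h>

int main()
{
  // Stand-ins for the accumulated heartbeat counters; first heartbeat, no branches yet
  double total_branch = 0, total_mispredictions = 0;

  // Only report accuracy once at least one branch has been counted
  if (total_branch > 0)
    fmt::print("Prediction_Accuracy: {:.4g}%\n", 100.0 * (total_branch - total_mispredictions) / total_branch);
  else
    fmt::print("Prediction_Accuracy: n/a\n");
}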